# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Standalone Paraformer Export Script for OpenVINO

This module provides a standalone export function for Paraformer ASR models
to OpenVINO format, independent of the main optimum-intel export pipeline.

Usage:
    python -m optimum.exporters.openvino.export_paraformer \
        --model /path/to/paraformer/model \
        --output /path/to/output \
        --int8  # optional, for INT8 weight compression

Or programmatically:
    from optimum.exporters.openvino.export_paraformer import export_paraformer
    export_paraformer(model_path, output_path, compress_int8=True)
"""

import argparse
import logging
import shutil
from pathlib import Path
from typing import Union

import torch

logger = logging.getLogger(__name__)

# Auxiliary files shipped next to a FunASR Paraformer checkpoint that the
# runtime pipeline still needs at inference time (CMVN stats, config, tokens).
PARAFORMER_PARAM_FILES = ("am.mvn", "config.yaml", "configuration.json", "seg_dict", "tokens.json")


def export_paraformer(
    model_name_or_path: Union[str, Path],
    output: Union[str, Path],
    device: str = "cpu",
    compress_int8: bool = False,
    compress_fp16: bool = True,
):
    """
    Export a Paraformer ASR model to OpenVINO format.

    This is a standalone export function that doesn't require modifications
    to the main optimum-intel export pipeline.

    Args:
        model_name_or_path: Path to the Paraformer model directory
        output: Output directory for the exported model
        device: Device to use for export (default: "cpu")
        compress_int8: Apply INT8 symmetric weight compression (default: False)
        compress_fp16: Store FP32 constants as FP16 (default: True, recommended for GPU)

    Returns:
        Tuple of ``(model, kwargs)`` as produced by ``build_model`` — the
        original docstring claimed ``None``, but callers receive the built
        PyTorch model and its construction kwargs for further inspection.

    Example:
        >>> from optimum.exporters.openvino.export_paraformer import export_paraformer
        >>> export_paraformer(
        ...     "/path/to/paraformer/model",
        ...     "/path/to/output",
        ...     compress_int8=True
        ... )
    """
    import openvino as ov

    # Lazy import: keeps this module importable without FunASR installed.
    from optimum.exporters.openvino.modeling_paraformer import build_model, export

    model_path = str(model_name_or_path)
    output_path = Path(output)

    logger.info("Exporting Paraformer model from %s", model_path)
    logger.info("Output directory: %s", output_path)

    # Build the PyTorch model, then trace it to TorchScript before conversion.
    model, kwargs = build_model(model=model_path, device=device)
    model_dir, model_jit_scripts = export(model, kwargs, type="torchscript", quantize=False, device=device)

    # Dynamic shapes: speech features (batch, frames, feat_dim) + int32 lengths.
    ovm = ov.convert_model(model_jit_scripts, input=[([-1, -1, -1], torch.float32), ([-1], torch.int32)])

    target_dir = output_path / "ov_models"
    target_dir.mkdir(parents=True, exist_ok=True)
    output_model_path = target_dir / "openvino_model.xml"

    if compress_int8:
        try:
            from nncf import CompressWeightsMode, compress_weights

            logger.info("Applying INT8 weight compression (symmetric)...")
            # INT8_SYM: no zero-point bias ops -> significantly faster on GPU
            ovm = compress_weights(ovm, mode=CompressWeightsMode.INT8_SYM)
            logger.info("Weight compression complete.")
        except ImportError:
            logger.warning("NNCF not available. Skipping INT8 compression. Install with: pip install nncf")

    # compress_to_fp16=True stores remaining FP32 constants as FP16, avoiding a
    # second FP32->FP16 conversion pass on GPU at runtime. Fix: use
    # ov.save_model for both branches — ov.serialize is deprecated.
    ov.save_model(ovm, str(output_model_path), compress_to_fp16=compress_fp16)
    if compress_fp16:
        logger.info("Model saved with FP16 compression to %s", output_model_path)
    else:
        logger.info("Model saved to %s", output_model_path)

    # Copy the auxiliary parameter files the inference pipeline expects.
    for file_name in PARAFORMER_PARAM_FILES:
        source_file = Path(model_dir) / file_name
        if source_file.exists():
            shutil.copy2(source_file, target_dir / file_name)
            logger.debug("Copied %s", file_name)

    logger.info("Export complete. Model saved to %s", target_dir)

    return model, kwargs


def main():
    """Command-line interface for Paraformer export."""
    parser = argparse.ArgumentParser(
        description="Export Paraformer ASR model to OpenVINO format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Basic export
    python -m optimum.exporters.openvino.export_paraformer \\
        --model /path/to/paraformer \\
        --output /path/to/output

    # Export with INT8 compression
    python -m optimum.exporters.openvino.export_paraformer \\
        --model /path/to/paraformer \\
        --output /path/to/output \\
        --int8
        """
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        required=True,
        help="Path to the Paraformer model directory"
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        required=True,
        help="Output directory for the exported model"
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu",
        help="Device to use for export (default: cpu)"
    )
    parser.add_argument(
        "--int8",
        action="store_true",
        help="Apply INT8 symmetric weight compression"
    )
    parser.add_argument(
        "--no-fp16",
        action="store_true",
        help="Disable FP16 compression for constants"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Setup logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Run export
    export_paraformer(
        model_name_or_path=args.model,
        output=args.output,
        device=args.device,
        compress_int8=args.int8,
        compress_fp16=not args.no_fp16,
    )


if __name__ == "__main__":
    main()
# Hook paraformer support into main_export for non-standard library support.
# Necessary because 'paraformer' is a FunASR library, not a transformers one.
try:
    from . import paraformer_plugin  # noqa: F401
except ImportError:
    pass  # Paraformer dependencies not available


class ParaformerDummyAudioInputGenerator(DummyInputGenerator):
    """
    Generates dummy audio inputs for Paraformer model export tracing.
    """

    SUPPORTED_INPUT_NAMES = ("speech", "speech_lengths")

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """
        Generates dummy audio features and lengths for Paraformer.

        Args:
            input_name: Name of the input ("speech" or "speech_lengths")
            framework: Framework to use (default: "pt")
            int_dtype: Integer dtype
            float_dtype: Float dtype

        Returns:
            Dummy tensor for the specified input

        Raises:
            ValueError: if `input_name` is not one of SUPPORTED_INPUT_NAMES.
        """
        if input_name == "speech":
            # Paraformer expects speech features: (batch_size, feature_length, feature_dim)
            # Typical feature_dim is 560 (80 mel features * 7 LFR stacking)
            batch_size = self.batch_size
            feature_length = 30  # Example length
            feature_dim = 560
            return self.random_float_tensor(
                shape=(batch_size, feature_length, feature_dim),
                min_value=-1.0,
                max_value=1.0,
                framework=framework,
                dtype=float_dtype,
            )
        if input_name == "speech_lengths":
            # Realistic per-utterance lengths; Paraformer uses int32 lengths.
            return self.random_int_tensor(
                shape=(self.batch_size,),
                max_value=30,
                min_value=6,
                framework=framework,
                dtype="int32",
            )
        # Fix: the original fell through and silently returned None for
        # unsupported names, which surfaces later as an opaque failure.
        raise ValueError(f"Unsupported input name for ParaformerDummyAudioInputGenerator: {input_name}")


class ParaformerModelPatcher(ModelPatcher):
    """
    Model patcher for Paraformer ASR models.
    Applies necessary modifications for export to OpenVINO format.
    """

    def __enter__(self):
        super().__enter__()

        # Lazy import so patching degrades gracefully without FunASR deps.
        try:
            from .modeling_paraformer import export_rebuild_model
        except ImportError:
            logger.warning("Could not import export_rebuild_model from modeling_paraformer")
            return self

        # Fix: the original did `self._config.values.get("max_seq_len", 512)`,
        # which raises AttributeError when _config is a config object (no
        # `.values` attribute) or TypeError when `.values` is the dict method.
        # NOTE(review): exact _config shape not visible here — handled defensively.
        values = getattr(self._config, "values", None)
        if isinstance(values, dict):
            max_seq_len = values.get("max_seq_len", 512)
        else:
            max_seq_len = getattr(self._config, "max_seq_len", 512)

        export_rebuild_model(self._model, max_seq_len=max_seq_len, device="cpu", type="onnx")

        return self

    def __exit__(self, exc_type, exc_value, traceback):
        super().__exit__(exc_type, exc_value, traceback)


@register_in_tasks_manager(
    "paraformer",
    *["automatic-speech-recognition"],
    library_name="transformers",
)
class ParaformerOpenVINOConfig(OnnxConfig):
    """
    OpenVINO export configuration for Paraformer ASR models.
    """

    DEFAULT_ONNX_OPSET = 14
    DUMMY_INPUT_GENERATOR_CLASSES = (ParaformerDummyAudioInputGenerator,)
    _MODEL_PATCHER = ParaformerModelPatcher

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        """Model inputs with dynamic batch/time axes."""
        return {
            "speech": {0: "batch_size", 1: "feats_length"},
            "speech_lengths": {0: "batch_size"},
        }

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        """Model outputs with dynamic batch/time axes."""
        return {
            "logits": {0: "batch_size", 1: "logits_length"},
            "token_num": {0: "batch_size"},
        }
# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/utils/repeat.py#L14
class MultiSequential(torch.nn.Sequential):
    """Multi-input multi-output torch.nn.Sequential with stochastic layer drop."""

    def __init__(self, *args, layer_drop_rate=0.0):
        """Initialize MultiSequential.

        Args:
            layer_drop_rate (float): Probability of dropping each layer in training.
        """
        super(MultiSequential, self).__init__(*args)
        self.layer_drop_rate = layer_drop_rate

    def forward(self, *args):
        """Pass *args through every layer, randomly skipping layers in training mode."""
        # One uniform draw per layer, taken up front. NOTE: this consumes RNG
        # state even in eval mode, matching the reference implementation.
        drop_draws = torch.empty(len(self)).uniform_()
        for layer_idx, layer in enumerate(self):
            if (not self.training) or (drop_draws[layer_idx] >= self.layer_drop_rate):
                args = layer(*args)
        return args


def repeat(N, fn, layer_drop_rate=0.0):
    """Stack N modules produced by fn(layer_index) into one MultiSequential.

    Args:
        N (int): Number of layers.
        fn (Callable): Factory mapping a layer index to a module.
        layer_drop_rate (float): Probability of dropping each layer.

    Returns:
        MultiSequential: The repeated model instance.
    """
    layers = [fn(layer_idx) for layer_idx in range(N)]
    return MultiSequential(*layers, layer_drop_rate=layer_drop_rate)


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/positionwise_feed_forward.py#L14
class PositionwiseFeedForward(torch.nn.Module):
    """Position-wise feed-forward: Linear -> activation -> dropout -> Linear.

    Args:
        idim (int): Input dimension.
        hidden_units (int): Number of hidden units.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
        """Construct a PositionwiseFeedForward object."""
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        self.w_2 = torch.nn.Linear(hidden_units, idim)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.activation = activation

    def forward(self, x):
        """Project up, apply activation + dropout, project back down."""
        hidden = self.dropout(self.activation(self.w_1(x)))
        return self.w_2(hidden)


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/embedding.py#L416
class StreamSinusoidalPositionEncoder(torch.nn.Module):
    """Placeholder streaming position encoder (stateless in this copy)."""

    def __init__(self, d_model=80, dropout_rate=0.1):
        # Arguments accepted for interface compatibility only.
        super().__init__()


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/embedding.py#L383
class SinusoidalPositionEncoder(torch.nn.Module):
    """Sinusoidal absolute position encoding computed on the fly from input shape."""

    def __init__(self, d_model=80, dropout_rate=0.1):
        # Arguments accepted for interface compatibility only; the table is
        # rebuilt from the input's timesteps/feature dim on every forward.
        super().__init__()

    def encode(
        self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32
    ):
        """Return the sin/cos encoding table for the given positions and depth."""
        batch_size = positions.size(0)
        positions = positions.type(dtype)
        device = positions.device
        log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / (
            depth / 2 - 1
        )
        inv_timescales = torch.exp(
            torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment)
        )
        inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
        scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(
            inv_timescales, [1, 1, -1]
        )
        # First half of the feature dim is sin, second half is cos.
        encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
        return encoding.type(dtype)

    def forward(self, x):
        """Add the position encoding to x of shape (batch, time, dim)."""
        batch_size, timesteps, input_dim = x.size()
        # Positions are 1-based in this implementation.
        position_ids = torch.arange(1, timesteps + 1, dtype=torch.int32, device=x.device)[None, :]
        table = self.encode(position_ids, input_dim, x.dtype).to(x.device)
        return x + table


def _pre_hook(
    state_dict,
    prefix,
    local_metadata,
    strict,
    missing_keys,
    unexpected_keys,
    error_msgs,
):
    """load_state_dict pre-hook for backward compatibility.

    Checkpoints up to v0.5.2 saved `pe`; it is recomputed now, so strip it
    from incoming state dicts to avoid unexpected-key errors.
    """
    stale_key = prefix + "pe"
    if stale_key in state_dict:
        state_dict.pop(stale_key)


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/embedding.py#L36
class PositionalEncoding(torch.nn.Module):
    """Precomputed sinusoidal positional encoding table.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
        reverse (bool): Whether to reverse the input position (legacy
            relative-position variant only).
    """

    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
        """Construct a PositionalEncoding object and prebuild the table."""
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.reverse = reverse
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
        self._register_load_state_dict_pre_hook(_pre_hook)

    def extend_pe(self, x):
        """Grow (or dtype/device-migrate) the cached table to cover x.size(1)."""
        if self.pe is not None:
            if self.pe.size(1) >= x.size(1):
                # Table is large enough; just align dtype/device if needed.
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        table = torch.zeros(x.size(1), self.d_model)
        if self.reverse:
            position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
        else:
            position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        table[:, 0::2] = torch.sin(position * div_term)
        table[:, 1::2] = torch.cos(position * div_term)
        table = table.unsqueeze(0)
        self.pe = table.to(device=x.device, dtype=x.dtype)


# Copied from FunASR (Apache 2.0):
# https://github.com/modelscope/FunASR/blob/main/funasr/models/transformer/layer_norm.py#L13
class LayerNorm(torch.nn.LayerNorm):
    """LayerNorm with eps=1e-12 and a recorded normalization dimension.

    Args:
        nout (int): Output dim size.
        dim (int): Dimension to be normalized.
    """

    def __init__(self, nout, dim=-1):
        """Construct a LayerNorm object."""
        super(LayerNorm, self).__init__(nout, eps=1e-12)
        self.dim = dim
class BaseTransformerDecoder(nn.Module):
    """Base class of Transformer decoder modules.

    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        dropout_rate: dropout rate
        positional_dropout_rate: dropout rate applied inside position encoding
        input_layer: input layer type ("embed" or "linear")
        use_output_layer: whether to add a final vocab projection
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before: whether to apply layer_norm before the first block
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
    ):
        super().__init__()
        attention_dim = encoder_output_size

        # Token embedding front-end: either a lookup table or a linear stack.
        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(vocab_size, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")

        self.normalize_before = normalize_before
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        self.output_layer = torch.nn.Linear(attention_dim, vocab_size) if use_output_layer else None

        # Must be set by subclasses.
        self.decoders = None


class sequence_mask(nn.Module):
    """Build a (batch, max_seq_len) mask: 1 where position < length, else 0.

    NOTE(review): the constructor arguments are accepted for interface
    compatibility but are not used by forward — verify against callers.
    """

    def __init__(self, max_seq_len=512, flip=True):
        super().__init__()

    def forward(self, lengths, max_seq_len=None, dtype=torch.float32, device=None):
        if max_seq_len is None:
            max_seq_len = lengths.max()
        positions = torch.arange(0, max_seq_len, 1, dtype=torch.int32, device=lengths.device)
        expanded_lengths = torch.unsqueeze(lengths, dim=-1).to(torch.int32)
        mask = (positions < expanded_lengths).type(dtype)
        return mask.to(device) if device is not None else mask


# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/multihead_att.py#L67
def preprocess_for_attn(x, mask, cache, pad_fn, kernel_size):
    """Mask, transpose to (b, d, t), then pad (first call) or prepend cache.

    Returns the prepared tensor and the updated cache (the trailing
    kernel_size-1 frames, kept for the next streaming step).
    """
    x = x * mask
    x = x.transpose(1, 2)
    x = pad_fn(x) if cache is None else torch.cat((cache, x), dim=2)
    cache = x[:, :, -(kernel_size - 1):]
    return x, cache
# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L140 (Apache 2.0)
class MultiHeadedAttentionSANM(nn.Module):
    """SANM multi-head attention: scaled dot-product attention combined with
    an FSMN memory block (depthwise Conv1d over the value stream).

    Args:
        n_head (int): The number of heads.
        in_feat (int): Input feature size fed to the fused q/k/v projection.
        n_feat (int): Attention feature size (must be divisible by n_head).
        dropout_rate (float): Dropout rate for attention weights and FSMN output.
        kernel_size (int): FSMN depthwise convolution kernel size.
        sanm_shfit (int): Extra left shift for the FSMN padding window.
        lora_list / lora_rank / lora_alpha / lora_dropout: optional LoRA
            adaptation of the projections (requires `loralib`).
    """

    def __init__(
        self,
        n_head,
        in_feat,
        n_feat,
        dropout_rate,
        kernel_size,
        sanm_shfit=0,
        lora_list=None,
        lora_rank=8,
        lora_alpha=16,
        lora_dropout=0.1,
    ):
        """Construct a MultiHeadedAttentionSANM object."""
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        if lora_list is not None:
            if not _LORA_AVAILABLE:
                raise ImportError(
                    "LoRA layers require the 'loralib' package. "
                    "Please install it with: pip install loralib"
                )
            if "o" in lora_list:
                self.linear_out = lora.Linear(
                    n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout
                )
            else:
                self.linear_out = nn.Linear(n_feat, n_feat)
            lora_qkv_list = ["q" in lora_list, "k" in lora_list, "v" in lora_list]
            if lora_qkv_list == [False, False, False]:
                self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
            else:
                self.linear_q_k_v = lora.MergedLinear(
                    in_feat,
                    n_feat * 3,
                    r=lora_rank,
                    lora_alpha=lora_alpha,
                    lora_dropout=lora_dropout,
                    enable_lora=lora_qkv_list,
                )
        else:
            self.linear_out = nn.Linear(n_feat, n_feat)
            self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
        # Fix: the original had a dead local `attn = None`; keep the attribute
        # for parity with the other attention classes in this file.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout_rate)

        # Depthwise conv implementing the FSMN memory over the value stream.
        self.fsmn_block = nn.Conv1d(
            n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False
        )
        # Asymmetric padding: sanm_shfit moves the window further into the past.
        left_padding = (kernel_size - 1) // 2
        if sanm_shfit > 0:
            left_padding = left_padding + sanm_shfit
        right_padding = kernel_size - 1 - left_padding
        self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)

    def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
        """Apply the FSMN memory block to the (masked) value stream."""
        b, t, d = inputs.size()
        if mask is not None:
            mask = torch.reshape(mask, (b, -1, 1))
            if mask_shfit_chunk is not None:
                mask = mask * mask_shfit_chunk
            inputs = inputs * mask

        x = inputs.transpose(1, 2)
        x = self.pad_fn(x)
        x = self.fsmn_block(x)
        x = x.transpose(1, 2)
        x += inputs  # residual connection
        x = self.dropout(x)
        if mask is not None:
            x = x * mask
        return x

    def forward_qkv(self, x):
        """Fused q/k/v projection, split and reshaped to per-head layout.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, size).

        Returns:
            torch.Tensor: Query tensor (#batch, n_head, time, d_k).
            torch.Tensor: Key tensor (#batch, n_head, time, d_k).
            torch.Tensor: Value tensor (#batch, n_head, time, d_k).
            torch.Tensor: Unsplit value tensor (#batch, time, n_feat) for FSMN.
        """
        b, t, d = x.size()
        q_k_v = self.linear_q_k_v(x)
        q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1)
        q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(1, 2)  # (batch, head, time1, d_k)
        k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(1, 2)  # (batch, head, time2, d_k)
        v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(1, 2)  # (batch, head, time2, d_k)
        return q_h, k_h, v_h, v

    def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None):
        """Compute attention context vector.

        Args:
            value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).

        Returns:
            torch.Tensor: Value weighted by the attention score (#batch, time1, d_model).
        """
        n_batch = value.size(0)
        if mask is not None:
            if mask_att_chunk_encoder is not None:
                mask = mask * mask_att_chunk_encoder

            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)

            min_value = -float("inf")
            scores = scores.masked_fill(mask, min_value)
            # Re-zero masked positions so fully-masked rows contribute nothing.
            attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0)  # (batch, head, time1, time2)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)  # (batch, time1, d_model)
        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
        """Compute scaled dot product attention plus the FSMN memory.

        Args:
            x (torch.Tensor): Input tensor (#batch, time1, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
        """
        q_h, k_h, v_h, v = self.forward_qkv(x)
        fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
        q_h = q_h * self.d_k ** (-0.5)
        scores = torch.matmul(q_h, k_h.transpose(-2, -1))
        att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
        return att_outs + fsmn_memory

    def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
        """Streaming (chunked) attention with a rolling key/value cache.

        Args:
            x (torch.Tensor): Input tensor (#batch, time1, size).
            cache (dict | None): {"k": ..., "v": ...} carried between chunks.
            chunk_size: chunk layout; chunk_size[2] is the right look-ahead
                stripped from the cache. NOTE(review): semantics inferred from
                indexing only — confirm against FunASR docs.
            look_back (int): number of past chunks to keep; -1 keeps all.

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            dict: Updated cache.
        """
        q_h, k_h, v_h, v = self.forward_qkv(x)
        # Precedence preserved from the original:
        # (chunk_size is not None and look_back > 0) or look_back == -1
        if (chunk_size is not None and look_back > 0) or look_back == -1:
            if cache is not None:
                k_h_stride = k_h[:, :, : -(chunk_size[2]), :]
                v_h_stride = v_h[:, :, : -(chunk_size[2]), :]
                k_h = torch.cat((cache["k"], k_h), dim=2)
                v_h = torch.cat((cache["v"], v_h), dim=2)

                cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2)
                cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2)
                if look_back != -1:
                    cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]):, :]
                    cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]):, :]
            else:
                cache = {
                    "k": k_h[:, :, : -(chunk_size[2]), :],
                    "v": v_h[:, :, : -(chunk_size[2]), :],
                }
        fsmn_memory = self.forward_fsmn(v, None)
        q_h = q_h * self.d_k ** (-0.5)
        scores = torch.matmul(q_h, k_h.transpose(-2, -1))
        att_outs = self.forward_attention(v_h, scores, None)
        return att_outs + fsmn_memory, cache
self.linear_q_k_v(x) + q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) + q_h = self.transpose_for_scores(q) + k_h = self.transpose_for_scores(k) + v_h = self.transpose_for_scores(v) + return q_h, k_h, v_h, v + + def forward_fsmn(self, inputs, mask): + # b, t, d = inputs.size() + # mask = torch.reshape(mask, (b, -1, 1)) + inputs = inputs * mask + x = inputs.transpose(1, 2) + x = self.pad_fn(x) + x = self.fsmn_block(x) + x = x.transpose(1, 2) + x = x + inputs + x = x * mask + return x + + def forward_attention(self, value, scores, mask): + scores = scores + mask + + attn = torch.softmax(scores, dim=-1) + context_layer = torch.matmul(attn, value) # (batch, head, time1, d_k) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + return self.linear_out(context_layer) # (batch, time1, d_model) + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L471 (Apache 2.0) +class MultiHeadedAttentionSANMDecoder(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. 
+ + """ + + def __init__(self, n_feat, dropout_rate, kernel_size, sanm_shfit=0): + """Construct an MultiHeadedAttention object.""" + super().__init__() + + self.dropout = nn.Dropout(p=dropout_rate) + + self.fsmn_block = nn.Conv1d( + n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False + ) + # padding + # padding + left_padding = (kernel_size - 1) // 2 + if sanm_shfit > 0: + left_padding = left_padding + sanm_shfit + right_padding = kernel_size - 1 - left_padding + self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) + self.kernel_size = kernel_size + + def forward(self, inputs, mask, cache=None, mask_shfit_chunk=None): + """ + :param x: (#batch, time1, size). + :param mask: Mask tensor (#batch, 1, time) + :return: + """ + # print("in fsmn, inputs", inputs.size()) + b, t, d = inputs.size() + # logging.info( + # "mask: {}".format(mask.size())) + if mask is not None: + mask = torch.reshape(mask, (b, -1, 1)) + # logging.info("in fsmn, mask: {}, {}".format(mask.size(), mask[0:100:50, :, :])) + if mask_shfit_chunk is not None: + # logging.info("in fsmn, mask_fsmn: {}, {}".format(mask_shfit_chunk.size(), mask_shfit_chunk[0:100:50, :, :])) + mask = mask * mask_shfit_chunk + # logging.info("in fsmn, mask_after_fsmn: {}, {}".format(mask.size(), mask[0:100:50, :, :])) + # print("in fsmn, mask", mask.size()) + # print("in fsmn, inputs", inputs.size()) + inputs = inputs * mask + + x = inputs.transpose(1, 2) + b, d, t = x.size() + if cache is None: + # print("in fsmn, cache is None, x", x.size()) + + x = self.pad_fn(x) + if not self.training: + cache = x + else: + # print("in fsmn, cache is not None, x", x.size()) + # x = torch.cat((x, cache), dim=2)[:, :, :-1] + # if t < self.kernel_size: + # x = self.pad_fn(x) + x = torch.cat((cache[:, :, 1:], x), dim=2) + x = x[:, :, -(self.kernel_size + t - 1) :] + # print("in fsmn, cache is not None, x_cat", x.size()) + cache = x + x = self.fsmn_block(x) + x = x.transpose(1, 2) + # print("in fsmn, 
fsmn_out", x.size())
        # Streaming path: with a cache the padded/conv output can be longer in
        # time than the current chunk, so the residual operands disagree.
        # NOTE(review): inputs[:, -1, :] drops the time dimension (result is
        # 2-D) and relies on broadcasting in the residual add below — confirm
        # against upstream FunASR that this is the intended shape handling.
        if x.size(1) != inputs.size(1):
            inputs = inputs[:, -1, :]

        # FSMN memory block: residual add, dropout, then re-apply padding mask.
        x = x + inputs
        x = self.dropout(x)
        if mask is not None:
            x = x * mask
        return x, cache

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L550 (Apache 2.0)
class MultiHeadedAttentionSANMDecoderExport(nn.Module):
    """Export-friendly wrapper around ``MultiHeadedAttentionSANMDecoder``.

    Shares the trained depthwise-conv FSMN block and padding of the eager
    module, but replaces its data-dependent control flow with a traceable
    ``forward`` (padding + cache handling delegated to ``preprocess_for_attn``)
    so the layer can be exported to TorchScript/ONNX/OpenVINO.
    """

    def __init__(self, model):
        super().__init__()
        # Reuse the trained submodules — no new parameters are created.
        self.fsmn_block = model.fsmn_block
        self.pad_fn = model.pad_fn
        self.kernel_size = model.kernel_size
        # Kept for API parity with the eager module; never populated here.
        self.attn = None

    def forward(self, inputs, mask, cache=None):
        # inputs: (batch, time, feat); mask gates padded positions; cache is
        # the rolling FSMN context window returned for the next call.
        x, cache = preprocess_for_attn(inputs, mask, cache, self.pad_fn, self.kernel_size)
        x = self.fsmn_block(x)
        x = x.transpose(1, 2)

        # Residual connection followed by masking (no dropout in export mode).
        x = x + inputs
        x = x * mask
        return x, cache

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L568 (Apache 2.0)
class MultiHeadedAttentionCrossAtt(nn.Module):
    """Multi-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
+ + """ + + def __init__( + self, + n_head, + n_feat, + dropout_rate, + lora_list=None, + lora_rank=8, + lora_alpha=16, + lora_dropout=0.1, + encoder_output_size=None, + ): + """Construct an MultiHeadedAttention object.""" + super().__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + if lora_list is not None: + if "q" in lora_list: + self.linear_q = lora.Linear( + n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout + ) + else: + self.linear_q = nn.Linear(n_feat, n_feat) + lora_kv_list = ["k" in lora_list, "v" in lora_list] + if lora_kv_list == [False, False]: + self.linear_k_v = nn.Linear( + n_feat if encoder_output_size is None else encoder_output_size, n_feat * 2 + ) + else: + self.linear_k_v = lora.MergedLinear( + n_feat if encoder_output_size is None else encoder_output_size, + n_feat * 2, + r=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + enable_lora=lora_kv_list, + ) + if "o" in lora_list: + self.linear_out = lora.Linear( + n_feat, n_feat, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout + ) + else: + self.linear_out = nn.Linear(n_feat, n_feat) + else: + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k_v = nn.Linear( + n_feat if encoder_output_size is None else encoder_output_size, n_feat * 2 + ) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, x, memory): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). 
+ + """ + + # print("in forward_qkv, x", x.size()) + b = x.size(0) + q = self.linear_q(x) + q_h = torch.reshape(q, (b, -1, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time1, d_k) + + k_v = self.linear_k_v(memory) + k, v = torch.split(k_v, int(self.h * self.d_k), dim=-1) + k_h = torch.reshape(k, (b, -1, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time2, d_k) + v_h = torch.reshape(v, (b, -1, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time2, d_k) + + return q_h, k_h, v_h + + def forward_attention(self, value, scores, mask, ret_attn=False): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = -float( + "inf" + ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) + # logging.info( + # "scores: {}, mask_size: {}".format(scores.size(), mask.size())) + scores = scores.masked_fill(mask, min_value) + attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + p_attn = self.dropout(attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + if ret_attn: + return self.linear_out(x), attn # (batch, time1, d_model) + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, x, memory, memory_mask, ret_attn=False): + """Compute scaled dot product attention. 
+ + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q_h, k_h, v_h = self.forward_qkv(x, memory) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + return self.forward_attention(v_h, scores, memory_mask, ret_attn=ret_attn) + + def forward_chunk(self, x, memory, cache=None, chunk_size=None, look_back=0): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
+ + """ + q_h, k_h, v_h = self.forward_qkv(x, memory) + if chunk_size is not None and look_back > 0: + if cache is not None: + k_h = torch.cat((cache["k"], k_h), dim=2) + v_h = torch.cat((cache["v"], v_h), dim=2) + cache["k"] = k_h[:, :, -(look_back * chunk_size[1]) :, :] + cache["v"] = v_h[:, :, -(look_back * chunk_size[1]) :, :] + else: + cache_tmp = { + "k": k_h[:, :, -(look_back * chunk_size[1]) :, :], + "v": v_h[:, :, -(look_back * chunk_size[1]) :, :], + } + cache = cache_tmp + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + return self.forward_attention(v_h, scores, None), cache + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/attention.py#L751 (Apache 2.0) +class MultiHeadedAttentionCrossAttExport(nn.Module): + def __init__(self, model): + super().__init__() + self.d_k = model.d_k + self.h = model.h + self.linear_q = model.linear_q + self.linear_k_v = model.linear_k_v + self.linear_out = model.linear_out + self.attn = None + self.all_head_size = self.h * self.d_k + + def forward(self, x, memory, memory_mask, ret_attn=False): + q, k, v = self.forward_qkv(x, memory) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, memory_mask, ret_attn) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.h, self.d_k) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward_qkv(self, x, memory): + q = self.linear_q(x) + + k_v = self.linear_k_v(memory) + k, v = torch.split(k_v, int(self.h * self.d_k), dim=-1) + q = self.transpose_for_scores(q) + k = self.transpose_for_scores(k) + v = self.transpose_for_scores(v) + return q, k, v + + def forward_attention(self, value, scores, mask, ret_attn): + scores = scores + mask.to(scores.device) + + attn = torch.softmax(scores, dim=-1) + context_layer = torch.matmul(attn, value) # (batch, head, time1, d_k) + + 
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + if ret_attn: + return self.linear_out(context_layer), attn + return self.linear_out(context_layer) # (batch, time1, d_model) + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/encoder.py#L44 (MIT License) +class EncoderLayerSANM(nn.Module): + def __init__( + self, + in_size, + size, + self_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, + ): + """Construct an EncoderLayer object.""" + super(EncoderLayerSANM, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(in_size) + self.norm2 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.in_size = in_size + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + self.dropout_rate = dropout_rate + + def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). + + """ + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. 
+ stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = torch.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = torch.cat([cache, x], dim=1) + return x, mask + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if self.concat_after: + x_concat = torch.cat( + ( + x, + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ), + ), + dim=-1, + ) + if self.in_size == self.size: + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = stoch_layer_coeff * self.concat_linear(x_concat) + else: + if self.in_size == self.size: + x = residual + stoch_layer_coeff * self.dropout( + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ) + ) + else: + x = stoch_layer_coeff * self.dropout( + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ) + ) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + + def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). 
+ + """ + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if self.in_size == self.size: + attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back) + x = residual + attn + else: + x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back) + + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.feed_forward(x) + if not self.normalize_before: + x = self.norm2(x) + + return x, cache + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/encoder.py#L188 (MIT License) +class SANMEncoder(nn.Module): + """ + Author: Zhifu Gao, Shiliang Zhang, Ming Lei, Ian McLoughlin + San-m: Memory equipped self-attention for end-to-end speech recognition + https://arxiv.org/abs/2006.01713 + """ + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: Optional[str] = "conv2d", + pos_enc_class=SinusoidalPositionEncoder, + normalize_before: bool = True, + concat_after: bool = False, + positionwise_layer_type: str = "linear", + positionwise_conv_kernel_size: int = 1, + padding_idx: int = -1, + interctc_layer_idx: List[int] = [], + interctc_use_conditioning: bool = False, + kernel_size: int = 11, + sanm_shfit: int = 0, + lora_list: List[str] = None, + lora_rank: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.1, + selfattention_layer_type: str = "sanm", + tf2torch_tensor_name_prefix_torch: str = "encoder", + tf2torch_tensor_name_prefix_tf: str = "seq2seq/encoder", + ): + super().__init__() + self._output_size = output_size + # input_layer is now force to set to "pe" + self.embed = SinusoidalPositionEncoder() + self.normalize_before = normalize_before + + # positionwise_layer_type is now 
force to set to "linear" + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + ) + + # selfattention_layer_type is now force to set to "sanm" + encoder_selfattn_layer = MultiHeadedAttentionSANM + encoder_selfattn_layer_args0 = ( + attention_heads, + input_size, + output_size, + attention_dropout_rate, + kernel_size, + sanm_shfit, + lora_list, + lora_rank, + lora_alpha, + lora_dropout, + ) + + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + output_size, + attention_dropout_rate, + kernel_size, + sanm_shfit, + lora_list, + lora_rank, + lora_alpha, + lora_dropout, + ) + + self.encoders0 = repeat( + 1, + lambda lnum: EncoderLayerSANM( + input_size, + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args0), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, + ), + ) + + self.encoders = repeat( + num_blocks - 1, + lambda lnum: EncoderLayerSANM( + output_size, + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, + ), + ) + + self.after_norm = LayerNorm(output_size) + + self.interctc_layer_idx = interctc_layer_idx + + self.interctc_use_conditioning = interctc_use_conditioning + self.conditioning_layer = None + self.dropout = nn.Dropout(dropout_rate) + self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch + self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf + + def output_size(self) -> int: + return self._output_size + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/encoder.py#L487 (MIT License) +class EncoderLayerSANMExport(nn.Module): + def __init__( + self, + model, + ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = model.self_attn + self.feed_forward = model.feed_forward + 
self.norm1 = model.norm1 + self.norm2 = model.norm2 + self.in_size = model.in_size + self.size = model.size + + def forward(self, x, mask): + + residual = x + x = self.norm1(x) + x = self.self_attn(x, mask) + if self.in_size == self.size: + x = x + residual + residual = x + x = self.norm2(x) + x = self.feed_forward(x) + x = x + residual + + return x, mask + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/encoder.py#L518 (MIT License) +class SANMEncoderExport(nn.Module): + def __init__( + self, + model, + max_seq_len=512, + feats_dim=560, + model_name="encoder", + onnx: bool = True, + ctc_linear: nn.Module = None, + ): + super().__init__() + self.embed = model.embed + if isinstance(self.embed, StreamSinusoidalPositionEncoder): + self.embed = None + self.model = model + self.feats_dim = feats_dim + self._output_size = model._output_size + + self.make_pad_mask = sequence_mask(max_seq_len, flip=False) + + # from export_model_hf.sanm.attention import MultiHeadedAttentionSANMExport + + if hasattr(model, "encoders0"): + for i, d in enumerate(self.model.encoders0): + if isinstance(d.self_attn, MultiHeadedAttentionSANM): + d.self_attn = MultiHeadedAttentionSANMExport(d.self_attn) + self.model.encoders0[i] = EncoderLayerSANMExport(d) + + for i, d in enumerate(self.model.encoders): + if isinstance(d.self_attn, MultiHeadedAttentionSANM): + d.self_attn = MultiHeadedAttentionSANMExport(d.self_attn) + self.model.encoders[i] = EncoderLayerSANMExport(d) + + self.model_name = model_name + self.num_heads = model.encoders[0].self_attn.h + self.hidden_size = model.encoders[0].self_attn.linear_out.out_features + + self.ctc_linear = ctc_linear + + def prepare_mask(self, mask): + mask_3d_btd = mask[:, :, None] + if len(mask.shape) == 2: + mask_4d_bhlt = 1 - mask[:, None, None, :] + elif len(mask.shape) == 3: + mask_4d_bhlt = 1 - mask[:, None, :] + mask_4d_bhlt = mask_4d_bhlt * -10000.0 + + return mask_3d_btd, mask_4d_bhlt + + def 
forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor, online: bool = False): + if not online: + speech = speech * self._output_size**0.5 + batch_size, seq_len, feat_dim = speech.shape + # Create range [0, 1, 2, ..., seq_len-1] that's shape-dependent, not value-dependent + arange = torch.arange(seq_len, dtype=torch.int32, device=speech.device).unsqueeze(0).expand(batch_size, -1) + lengths_expanded = speech_lengths.unsqueeze(1).to(torch.int32) + # Mask where position < length (convert bool to float for prepare_mask) + mask = (arange < lengths_expanded).to(torch.float32) + mask = self.prepare_mask(mask) + if self.embed is None: + xs_pad = speech + else: + xs_pad = self.embed(speech) + + encoder_outs = self.model.encoders0(xs_pad, mask) + xs_pad, masks = encoder_outs[0], encoder_outs[1] + + encoder_outs = self.model.encoders(xs_pad, mask) + xs_pad, masks = encoder_outs[0], encoder_outs[1] + + xs_pad = self.model.after_norm(xs_pad) + + if self.ctc_linear is not None: + xs_pad = self.ctc_linear(xs_pad) + xs_pad = F.softmax(xs_pad, dim=2) + + return xs_pad, speech_lengths + +#Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/sanm/positionwise_feed_forward.py#L12 +class PositionwiseFeedForwardDecoderSANM(torch.nn.Module): + """Positionwise feed forward layer. + + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. 

    """

    def __init__(self, idim, hidden_units, dropout_rate, adim=None, activation=torch.nn.ReLU()):
        """Construct an PositionwiseFeedForward object."""
        # NOTE(review): `activation=torch.nn.ReLU()` is a mutable default
        # argument shared by every instance built without an explicit
        # activation; nn.ReLU is stateless/parameter-free so this is harmless
        # in practice, but it is kept as-is to match upstream FunASR.
        super(PositionwiseFeedForwardDecoderSANM, self).__init__()
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        # Projection back down; `adim` lets the output dim differ from the
        # input dim. bias=False matches the upstream SANM decoder FFN.
        self.w_2 = torch.nn.Linear(hidden_units, idim if adim is None else adim, bias=False)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.activation = activation
        # LayerNorm applied in the hidden space, between dropout and w_2.
        self.norm = LayerNorm(hidden_units)

    def forward(self, x):
        """Forward function."""
        # w_1 -> activation -> dropout -> layer norm -> w_2.
        return self.w_2(self.norm(self.dropout(self.activation(self.w_1(x)))))

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/decoder.py#L26 (MIT License)
class DecoderLayerSANM(torch.nn.Module):
    """Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e.
x -> x + att(x) + + + """ + + def __init__( + self, + size, + self_attn, + src_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, + ): + """Construct an DecoderLayer object.""" + super(DecoderLayerSANM, self).__init__() + self.size = size + self.self_attn = self_attn + self.src_attn = src_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(size) + if self_attn is not None: + self.norm2 = LayerNorm(size) + if src_attn is not None: + self.norm3 = LayerNorm(size) + self.dropout = torch.nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear1 = torch.nn.Linear(size + size, size) + self.concat_linear2 = torch.nn.Linear(size + size, size) + self.reserve_attn = False + self.attn_mat = [] + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/decoder.py#L225 (MIT License) +class ParaformerSANMDecoder(BaseTransformerDecoder): + """ + Author: Speech Lab of DAMO Academy, Alibaba Group + Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition + https://arxiv.org/abs/2006.01713 + """ + + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + wo_input_layer: bool = False, + pos_enc_class=PositionalEncoding, + normalize_before: bool = True, + concat_after: bool = False, + att_layer_num: int = 6, + kernel_size: int = 21, + sanm_shfit: int = 0, + lora_list: List[str] = None, + lora_rank: int = 8, + lora_alpha: int = 16, + lora_dropout: float = 0.1, + chunk_multiply_factor: tuple = (1,), + tf2torch_tensor_name_prefix_torch: str = "decoder", 
+ tf2torch_tensor_name_prefix_tf: str = "seq2seq/decoder", + ): + super().__init__( + vocab_size=vocab_size, + encoder_output_size=encoder_output_size, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + input_layer=input_layer, + use_output_layer=use_output_layer, + pos_enc_class=pos_enc_class, + normalize_before=normalize_before, + ) + + attention_dim = encoder_output_size + + # wo_input_layer is now force to set to False + # input_layer is now force to set to "embed" + self.embed = torch.nn.Sequential( + torch.nn.Embedding(vocab_size, attention_dim), + ) + + self.normalize_before = normalize_before + + # self.normalize_before is now force to set to True + self.after_norm = LayerNorm(attention_dim) + # use_output_layer is now force to set to True + self.output_layer = torch.nn.Linear(attention_dim, vocab_size) + + self.att_layer_num = att_layer_num + self.num_blocks = num_blocks + + self.decoders = repeat( + att_layer_num, + lambda lnum: DecoderLayerSANM( + attention_dim, + MultiHeadedAttentionSANMDecoder( + attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit + ), + MultiHeadedAttentionCrossAtt( + attention_heads, + attention_dim, + src_attention_dropout_rate, + lora_list, + lora_rank, + lora_alpha, + lora_dropout, + ), + PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate), + dropout_rate, + normalize_before, + concat_after, + ), + ) + + # num_blocks - att_layer_num return 0 + self.decoders2 = None + + self.decoders3 = repeat( + 1, + lambda lnum: DecoderLayerSANM( + attention_dim, + None, + None, + PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate), + dropout_rate, + normalize_before, + concat_after, + ), + ) + self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch + self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf + self.chunk_multiply_factor = chunk_multiply_factor + +# Copied from 
https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/decoder.py#L592 (MIT License)
class DecoderLayerSANMExport(torch.nn.Module):
    """Export-friendly decoder layer wrapping an eager ``DecoderLayerSANM``.

    Assumes pre-norm ordering (normalize_before=True, no concat_after) and
    keeps only the submodules needed for a traceable forward pass.
    """

    def __init__(self, model):
        super().__init__()
        # Share trained submodules of the eager layer; self_attn / src_attn
        # may be None for the final FFN-only layers (decoders3).
        self.self_attn = model.self_attn
        self.src_attn = model.src_attn
        self.feed_forward = model.feed_forward
        self.norm1 = model.norm1
        # norm2/norm3 only exist when the corresponding attention exists.
        self.norm2 = model.norm2 if hasattr(model, "norm2") else None
        self.norm3 = model.norm3 if hasattr(model, "norm3") else None
        self.size = model.size

    def forward(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):

        # Pre-norm FFN first (SANM decoder ordering), then FSMN self-attn,
        # then cross-attention — each guarded so FFN-only layers still work.
        residual = tgt
        tgt = self.norm1(tgt)
        tgt = self.feed_forward(tgt)

        x = tgt
        if self.self_attn is not None:
            tgt = self.norm2(tgt)
            x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
            x = residual + x
        # NOTE(review): when self_attn is None the FFN output carries no
        # residual add — this mirrors upstream FunASR; confirm intentional.

        if self.src_attn is not None:
            residual = x
            x = self.norm3(x)
            x = residual + self.src_attn(x, memory, memory_mask)

        # Returns a 5-tuple so stacked layers can be chained via repeat().
        return x, tgt_mask, memory, memory_mask, cache

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/decoder.py#L641 (MIT License)
class ParaformerSANMDecoderExport(torch.nn.Module):
    """Traceable Paraformer decoder: rewrites eager sublayers in place."""

    def __init__(self, model, max_seq_len=512, model_name="decoder", onnx: bool = True, **kwargs):
        super().__init__()

        self.model = model

        self.make_pad_mask = sequence_mask(max_seq_len, flip=False)

        # Swap every eager attention/layer for its export counterpart.
        # NOTE: this mutates `model` in place — the eager module is no longer
        # usable for training/inference after wrapping.
        for i, d in enumerate(self.model.decoders):
            if isinstance(d.self_attn, MultiHeadedAttentionSANMDecoder):
                d.self_attn = MultiHeadedAttentionSANMDecoderExport(d.self_attn)
            if isinstance(d.src_attn, MultiHeadedAttentionCrossAtt):
                d.src_attn = MultiHeadedAttentionCrossAttExport(d.src_attn)
            self.model.decoders[i] = DecoderLayerSANMExport(d)

        # decoders2 is None for this configuration (num_blocks == att_layer_num),
        # but handle the general upstream layout anyway.
        if self.model.decoders2 is not None:
            for i, d in enumerate(self.model.decoders2):
                if isinstance(d.self_attn, MultiHeadedAttentionSANMDecoder):
                    d.self_attn = MultiHeadedAttentionSANMDecoderExport(d.self_attn)
                self.model.decoders2[i] = DecoderLayerSANMExport(d)

for i, d in enumerate(self.model.decoders3): + self.model.decoders3[i] = DecoderLayerSANMExport(d) + + self.output_layer = model.output_layer + self.after_norm = model.after_norm + self.model_name = model_name + + def prepare_mask(self, mask): + mask_3d_btd = mask[:, :, None] + if len(mask.shape) == 2: + mask_4d_bhlt = 1 - mask[:, None, None, :] + elif len(mask.shape) == 3: + mask_4d_bhlt = 1 - mask[:, None, :] + mask_4d_bhlt = mask_4d_bhlt * -10000.0 + + return mask_3d_btd, mask_4d_bhlt + + def forward( + self, + hs_pad: torch.Tensor, + hlens: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + return_hidden: bool = False, + return_both: bool = False, + ): + + tgt = ys_in_pad + batch_size = tgt.shape[0] + tgt_seq_len = tgt.shape[1] + arange_tgt = torch.arange(tgt_seq_len, dtype=torch.int32, device=tgt.device).unsqueeze(0).expand(batch_size, -1) + tgt_mask = (arange_tgt < ys_in_lens.unsqueeze(1).to(torch.int32)).to(torch.float32) + tgt_mask, _ = self.prepare_mask(tgt_mask) + # tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None] + + memory = hs_pad + mem_seq_len = memory.shape[1] + arange_mem = torch.arange(mem_seq_len, dtype=torch.int32, device=memory.device).unsqueeze(0).expand(batch_size, -1) + memory_mask = (arange_mem < hlens.unsqueeze(1).to(torch.int32)).to(torch.float32) + _, memory_mask = self.prepare_mask(memory_mask) + # memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :] + + x = tgt + x, tgt_mask, memory, memory_mask, _ = self.model.decoders(x, tgt_mask, memory, memory_mask) + if self.model.decoders2 is not None: + x, tgt_mask, memory, memory_mask, _ = self.model.decoders2( + x, tgt_mask, memory, memory_mask + ) + x, tgt_mask, memory, memory_mask, _ = self.model.decoders3(x, tgt_mask, memory, memory_mask) + hidden = self.after_norm(x) + # x = self.output_layer(x) + + if self.output_layer is not None and return_hidden is False: + x = self.output_layer(hidden) + return x, ys_in_lens + if 
return_both: + x = self.output_layer(hidden) + return x, hidden, ys_in_lens + return hidden, ys_in_lens + +# Modified from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/export_meta.py#L11 (MIT License) +def export_rebuild_model(model, **kwargs): + model.device = kwargs.get("device") + is_onnx = kwargs.get("type", "onnx") == "onnx" + model.encoder = SANMEncoderExport(model.encoder, onnx=is_onnx) + model.predictor = CifPredictorV2Export(model.predictor, onnx=is_onnx) + model.decoder = ParaformerSANMDecoderExport(model.decoder, onnx=is_onnx) + model.make_pad_mask = sequence_mask(kwargs["max_seq_len"], flip=False) + model.forward = types.MethodType(export_forward, model) + model.export_dummy_inputs = types.MethodType(export_dummy_inputs, model) + model.export_input_names = types.MethodType(export_input_names, model) + model.export_output_names = types.MethodType(export_output_names, model) + model.export_dynamic_axes = types.MethodType(export_dynamic_axes, model) + model.export_name = types.MethodType(export_name, model) + + # model.export_name = "model" + return model + + +def export_forward( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, +): + # a. 
To device
    batch = {"speech": speech, "speech_lengths": speech_lengths}
    # batch = to_device(batch, device=self.device)

    # Encode features, then let the CIF predictor estimate the number of
    # output tokens and the acoustic embeddings for the decoder.
    enc, enc_len = self.encoder(**batch)
    mask = self.make_pad_mask(enc_len)[:, None, :]
    pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(enc, mask)
    # Token counts must be integral indices downstream.
    pre_token_length = pre_token_length.floor().type(torch.int32)

    decoder_out, _ = self.decoder(enc, enc_len, pre_acoustic_embeds, pre_token_length)
    decoder_out = torch.log_softmax(decoder_out, dim=-1)
    # sample_ids = decoder_out.argmax(dim=-1)

    # Log-probabilities over the vocabulary plus the per-utterance token count.
    return decoder_out, pre_token_length


def export_dummy_inputs(self):
    """Return tracer inputs: a (2, 30, 560) feature batch with lengths (6, 30)."""
    speech = torch.randn(2, 30, 560)
    speech_lengths = torch.tensor([6, 30], dtype=torch.int32)
    return (speech, speech_lengths)


def export_input_names(self):
    """Input tensor names for the exported graph."""
    return ["speech", "speech_lengths"]


def export_output_names(self):
    """Output tensor names for the exported graph."""
    return ["logits", "token_num"]


def export_dynamic_axes(self):
    """Dynamic-axis spec: batch and time dimensions are variable."""
    return {
        "speech": {0: "batch_size", 1: "feats_length"},
        "speech_lengths": {
            0: "batch_size",
        },
        "logits": {0: "batch_size", 1: "logits_length"},
        "token_num": {0: "batch_size"}
    }


def export_name(
    self,
):
    """Base filename for the exported model artifacts."""
    return "model"

# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/cif_predictor.py#L173 (MIT License)
class CifPredictorV2(torch.nn.Module):
    """CIF (Continuous Integrate-and-Fire) predictor, version 2.

    A 1-D conv + sigmoid head produces per-frame firing weights (alphas);
    integrating them determines token boundaries and acoustic embeddings.
    """

    def __init__(
        self,
        idim,
        l_order,
        r_order,
        threshold=1.0,
        dropout=0.1,
        smooth_factor=1.0,
        noise_threshold=0,
        tail_threshold=0.0,
        tf2torch_tensor_name_prefix_torch="predictor",
        tf2torch_tensor_name_prefix_tf="seq2seq/cif",
        tail_mask=True,
    ):
        super().__init__()

        # Asymmetric context window of l_order past / r_order future frames.
        self.pad = torch.nn.ConstantPad1d((l_order, r_order), 0)
        self.cif_conv1d = torch.nn.Conv1d(idim, idim, l_order + r_order + 1)
        # Scalar weight (alpha) per frame.
        self.cif_output = torch.nn.Linear(idim, 1)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.threshold = threshold
        self.smooth_factor = smooth_factor
        self.noise_threshold = noise_threshold
self.tail_threshold = tail_threshold + self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch + self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf + self.tail_mask = tail_mask + +# Copied from https://github.com/modelscope/FunASR/blob/main/funasr/models/paraformer/cif_predictor.py#L431 (MIT License) +class CifPredictorV2Export(torch.nn.Module): + def __init__(self, model, **kwargs): + super().__init__() + + self.pad = model.pad + self.cif_conv1d = model.cif_conv1d + self.cif_output = model.cif_output + self.threshold = model.threshold + self.smooth_factor = model.smooth_factor + self.noise_threshold = model.noise_threshold + self.tail_threshold = model.tail_threshold + + def forward( + self, + hidden: torch.Tensor, + mask: torch.Tensor, + ): + alphas, token_num = self.forward_cnn(hidden, mask) + mask = mask.transpose(-1, -2).float() + mask = mask.squeeze(-1) + hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, mask=mask) + acoustic_embeds, cif_peak = cif_v1_export(hidden, alphas, self.threshold) + + return acoustic_embeds, token_num, alphas, cif_peak + + def forward_cnn( + self, + hidden: torch.Tensor, + mask: torch.Tensor, + ): + h = hidden + context = h.transpose(1, 2) + queries = self.pad(context) + output = torch.relu(self.cif_conv1d(queries)) + output = output.transpose(1, 2) + + output = self.cif_output(output) + alphas = torch.sigmoid(output) + alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold) + mask = mask.transpose(-1, -2).float() + alphas = alphas * mask + alphas = alphas.squeeze(-1) + token_num = alphas.sum(-1) + + return alphas, token_num + + def tail_process_fn(self, hidden, alphas, token_num=None, mask=None): + b, t, d = hidden.size() + tail_threshold = self.tail_threshold + + zeros_t = torch.zeros((b, 1), dtype=torch.float32, device=alphas.device) + ones_t = torch.ones_like(zeros_t) + + mask_1 = torch.cat([mask, zeros_t], dim=1) + mask_2 = 
@torch.jit.script
def cif_v1_export(hidden, alphas, threshold: float):
    """Vectorized continuous integrate-and-fire (CIF).

    A token "fires" wherever the running (cumulative) sum of *alphas* crosses an
    integer boundary; each emitted frame is the alpha-weighted sum of the hidden
    states accumulated since the previous fire, computed here via prefix-sum
    differences instead of a per-step loop so the function is traceable.

    Returns (frame_fires, fires): frame_fires is (batch, max_token_len, hidden)
    with rows beyond each utterance's token count left at zero; fires carries the
    per-frame fire indicator plus the fractional remainder of the running sum.
    """
    device = hidden.device
    dtype = hidden.dtype
    batch_size, len_time, hidden_size = hidden.size()
    threshold = torch.tensor([threshold], dtype=alphas.dtype).to(alphas.device)

    frames = torch.zeros(batch_size, len_time, hidden_size, dtype=dtype, device=device)
    fires = torch.zeros(batch_size, len_time, dtype=dtype, device=device)

    # cumsum in float64: float32 cumsum precision degradation causes wrong
    # integer-boundary detection on long sequences.
    prefix_sum = torch.cumsum(alphas, dim=1, dtype=torch.float64).to(torch.float32)
    prefix_sum_floor = torch.floor(prefix_sum)
    dislocation_prefix_sum = torch.roll(prefix_sum, 1, dims=1)
    dislocation_prefix_sum_floor = torch.floor(dislocation_prefix_sum)

    # Position 0 has no predecessor; roll wrapped the last element around.
    dislocation_prefix_sum_floor[:, 0] = 0
    dislocation_diff = prefix_sum_floor - dislocation_prefix_sum_floor

    # A frame fires where the floored running sum increased.
    fire_idxs = dislocation_diff > 0
    fires[fire_idxs] = 1
    fires = fires + prefix_sum - prefix_sum_floor

    # Weighted prefix sums of hidden states, sampled at fire positions.
    prefix_sum_hidden = torch.cumsum(alphas.unsqueeze(-1).repeat((1, 1, hidden_size)) * hidden, dim=1)
    frames = prefix_sum_hidden[fire_idxs]
    shift_frames = torch.roll(frames, 1, dims=0)

    # Per-batch fire counts and the flat offset of each utterance's first fire;
    # the first fire of each utterance must not subtract the previous utterance's sum.
    batch_len = fire_idxs.sum(1)
    batch_idxs = torch.cumsum(batch_len, dim=0)
    shift_batch_idxs = torch.roll(batch_idxs, 1, dims=0)
    shift_batch_idxs[0] = 0
    shift_frames[shift_batch_idxs] = 0

    # Carry the fractional remainder of each fire into the next token.
    remains = fires - torch.floor(fires)
    remain_frames = remains[fire_idxs].unsqueeze(-1).repeat((1, hidden_size)) * hidden[fire_idxs]

    shift_remain_frames = torch.roll(remain_frames, 1, dims=0)
    shift_remain_frames[shift_batch_idxs] = 0

    frames = frames - shift_frames + shift_remain_frames - remain_frames

    # Token-capacity of the output: max floored alpha mass over the batch.
    max_label_len = alphas.sum(dim=-1)
    max_label_len = torch.floor(max_label_len).max().to(dtype=torch.int32)

    # Scatter emitted frames into a padded (batch, max_label_len, hidden) tensor.
    frame_fires = torch.zeros(batch_size, max_label_len, hidden_size, dtype=dtype, device=device)
    indices = torch.arange(max_label_len, dtype=torch.int32, device=device).expand(batch_size, -1)
    frame_fires_idxs = indices < batch_len.unsqueeze(1).to(torch.int32)
    frame_fires[frame_fires_idxs] = frames
    return frame_fires, fires
"", + # sym_blank: str = "", + # extract_feats_in_collect_stats: bool = True, + # predictor=None, + predictor_weight: float = 0.0, + predictor_bias: int = 0, + sampling_ratio: float = 0.2, + share_embedding: bool = False, + # preencoder: Optional[AbsPreEncoder] = None, + # postencoder: Optional[AbsPostEncoder] = None, + use_1st_decoder_loss: bool = False, + **kwargs, + ): + + super().__init__() + # Filter out streaming-specific parameters not supported by SANMEncoder + sanm_encoder_params = { + 'input_size', 'output_size', 'attention_heads', 'linear_units', + 'num_blocks', 'dropout_rate', 'positional_dropout_rate', + 'attention_dropout_rate', 'input_layer', 'pos_enc_class', + 'normalize_before', 'concat_after', 'positionwise_layer_type', + 'positionwise_conv_kernel_size', 'padding_idx', 'interctc_layer_idx', + 'interctc_use_conditioning', 'kernel_size', 'sanm_shfit', + 'lora_list', 'lora_rank', 'lora_alpha', 'lora_dropout', + 'selfattention_layer_type', 'tf2torch_tensor_name_prefix_torch', + 'tf2torch_tensor_name_prefix_tf' + } + filtered_encoder_conf = {k: v for k, v in encoder_conf.items() if k in sanm_encoder_params} + encoder = SANMEncoder(input_size=input_size, **filtered_encoder_conf) + encoder_output_size = encoder.output_size() + + if decoder is not None: + decoder = ParaformerSANMDecoder( + vocab_size=vocab_size, + encoder_output_size=encoder_output_size, + **decoder_conf, + ) + + if predictor is not None: + predictor = CifPredictorV2(**predictor_conf) + + self.encoder = encoder + self.decoder = decoder + self.predictor = predictor + + def export(self, **kwargs): + + if "max_seq_len" not in kwargs: + kwargs["max_seq_len"] = 512 + models = export_rebuild_model(model=self, **kwargs) + return models + +def add_file_root_path(model_or_path: str, file_path_metas: dict, cfg={}): + + if isinstance(file_path_metas, dict): + if isinstance(cfg, list): + cfg.append({}) + + for k, v in file_path_metas.items(): + if isinstance(v, str): + p = os.path.join(model_or_path, 
def add_file_root_path(model_or_path: str, file_path_metas: dict, cfg=None):
    """
    Resolve the relative paths in *file_path_metas* against *model_or_path*.

    Walks the (possibly nested) metadata structure and records in *cfg* the
    absolute path of every entry that exists on disk; entries whose file is
    missing are skipped. *cfg* is mutated in place and also returned.

    Args:
        model_or_path: Root directory the relative paths are joined to.
        file_path_metas: Nested dict/list structure of relative path strings.
        cfg: Accumulator (dict, or list of dicts). Defaults to a fresh dict.

    Fix: the default for ``cfg`` was a shared mutable ``{}``, so results from
    earlier calls leaked into later ones; replaced with the None-sentinel idiom.
    """
    if cfg is None:
        cfg = {}

    if isinstance(file_path_metas, dict):
        if isinstance(cfg, list):
            # When collecting into a list, each dict level appends a new record.
            cfg.append({})

        for k, v in file_path_metas.items():
            if isinstance(v, str):
                p = os.path.join(model_or_path, v)
                if os.path.exists(p):
                    if isinstance(cfg, dict):
                        cfg[k] = p
                    elif isinstance(cfg, list):
                        cfg[-1][k] = p

            elif isinstance(v, dict):
                if isinstance(cfg, dict):
                    if k not in cfg:
                        cfg[k] = {}
                    add_file_root_path(model_or_path, v, cfg[k])

            elif isinstance(v, (list, tuple)):
                # Hoisted out of the loop: the key check is loop-invariant.
                if k not in cfg:
                    cfg[k] = []
                for vv in v:
                    if isinstance(vv, str):
                        p = os.path.join(model_or_path, vv)
                        if os.path.exists(p):
                            if isinstance(cfg[k], dict):
                                cfg[k] = p
                            elif isinstance(cfg[k], list):
                                cfg[k].append(p)
                    elif isinstance(vv, dict):
                        add_file_root_path(model_or_path, vv, cfg[k])

    return cfg
def download_from_hf(**kwargs):
    """Resolve the model directory (downloading from the HF Hub if needed) and
    fold the on-disk configuration into *kwargs*.

    Reads either ``configuration.json`` (+ ``file_path_metas``) or
    ``config.yaml`` from the model directory, merges it with the caller's
    kwargs via OmegaConf, and fills in ``init_param``/tokenizer/cmvn paths.
    Returns a plain dict.
    """
    model_or_path = kwargs.get("model")
    model_revision = kwargs.get("model_revision", "master")
    if not os.path.exists(model_or_path) and "model_path" not in kwargs:
        try:
            model_or_path = get_or_download_model_dir_hf(
                model_or_path,
                model_revision,
                is_training=kwargs.get("is_training"),
                check_latest=kwargs.get("check_latest", True),
            )
        except Exception as e:
            # Best-effort: fall through with the original identifier on failure.
            print(f"Download: {model_or_path} failed!: {e}")

    kwargs["model_path"] = model_or_path if "model_path" not in kwargs else kwargs["model_path"]

    if os.path.exists(os.path.join(model_or_path, "configuration.json")):
        # ModelScope-style layout: configuration.json points at the real config.
        with open(os.path.join(model_or_path, "configuration.json"), "r", encoding="utf-8") as f:
            conf_json = json.load(f)
        cfg = {}
        if "file_path_metas" in conf_json:
            add_file_root_path(model_or_path, conf_json["file_path_metas"], cfg)
        cfg.update(kwargs)
        if "config" in cfg:
            config = OmegaConf.load(cfg["config"])
            kwargs = OmegaConf.merge(config, cfg)
            kwargs["model"] = config["model"]
    elif os.path.exists(os.path.join(model_or_path, "config.yaml")):
        # FunASR layout: config.yaml + model.pt + tokens.json (+ seg_dict, am.mvn).
        config = OmegaConf.load(os.path.join(model_or_path, "config.yaml"))
        kwargs = OmegaConf.merge(config, kwargs)
        init_param = os.path.join(model_or_path, "model.pt")
        if "init_param" not in kwargs or not os.path.exists(kwargs["init_param"]):
            kwargs["init_param"] = init_param
            # NOTE(review): asserting here means a config.yaml without model.pt
            # is fatal — confirm this is intended.
            assert os.path.exists(kwargs["init_param"]), "init_param does not exist"
        if os.path.exists(os.path.join(model_or_path, "tokens.json")):
            kwargs["tokenizer_conf"]["token_list"] = os.path.join(model_or_path, "tokens.json")
        if os.path.exists(os.path.join(model_or_path, "seg_dict")):
            kwargs["tokenizer_conf"]["seg_dict"] = os.path.join(model_or_path, "seg_dict")
        kwargs["model"] = config["model"]
        if os.path.exists(os.path.join(model_or_path, "am.mvn")):
            kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn")
    if isinstance(kwargs, DictConfig):
        # OmegaConf.merge returns a DictConfig; downstream code expects a dict.
        kwargs = OmegaConf.to_container(kwargs, resolve=True)

    return kwargs
kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn") + if isinstance(kwargs, DictConfig): + kwargs = OmegaConf.to_container(kwargs, resolve=True) + + return kwargs + +def deep_update(original, update): + for key, value in update.items(): + if isinstance(value, dict) and key in original: + if len(value) == 0: + original[key] = value + deep_update(original[key], value) + else: + original[key] = value + +def load_pretrained_model( + path: str, + model: torch.nn.Module, + ignore_init_mismatch: bool = True, + map_location: str = "cpu", + oss_bucket=None, + scope_map=[], + excludes=None, + **kwargs, +): + """Load a model state and set it to the model. + + Args: + init_param: ::: + + Examples: + + """ + + obj = model + dst_state = obj.state_dict() + ori_state = torch.load(path, map_location=map_location) + + src_state = copy.deepcopy(ori_state) + src_state = src_state["state_dict"] if "state_dict" in src_state else src_state + src_state = src_state["model_state_dict"] if "model_state_dict" in src_state else src_state + src_state = src_state["model"] if "model" in src_state else src_state + + if isinstance(scope_map, str): + scope_map = scope_map.split(",") + scope_map += ["module.", "None"] + logging.info(f"scope_map: {scope_map}") + + for k in dst_state.keys(): + excludes_flag = False + if excludes is not None: + for k_ex in excludes: + if k.startswith(k_ex): + logging.info(f"key: {k} matching: {k_ex}, excluded") + excludes_flag = True + break + if excludes_flag: + continue + + k_src = k + + if scope_map is not None: + src_prefix = "" + dst_prefix = "" + for i in range(0, len(scope_map), 2): + src_prefix = scope_map[i] if scope_map[i].lower() != "none" else "" + dst_prefix = scope_map[i + 1] if scope_map[i + 1].lower() != "none" else "" + + if dst_prefix == "" and (src_prefix + k) in src_state.keys(): + k_src = src_prefix + k + if not k_src.startswith("module."): + logging.info(f"init param, map: {k} from {k_src} in ckpt") + elif ( + 
def load_pretrained_model(
    path: str,
    model: torch.nn.Module,
    ignore_init_mismatch: bool = True,
    map_location: str = "cpu",
    oss_bucket=None,
    scope_map=[],
    excludes=None,
    **kwargs,
):
    """Load a checkpoint from *path* into *model*.

    Unwraps common checkpoint layouts ("state_dict" / "model_state_dict" /
    "model"), optionally remaps parameter names via *scope_map* (flat list of
    src-prefix/dst-prefix pairs; "none" means empty prefix), skips keys whose
    prefix is in *excludes*, and — when *ignore_init_mismatch* is set — keeps
    the model's own tensor for any shape mismatch instead of failing.

    Fix: ``scope_map += ["module.", "None"]`` previously mutated both the
    mutable ``[]`` default (growing it on every call) and any caller-supplied
    list; the list is now copied before being extended.

    Args:
        path: checkpoint file readable by ``torch.load``.
        model: target module; its state dict is updated and re-loaded strictly.
    """
    obj = model
    dst_state = obj.state_dict()
    ori_state = torch.load(path, map_location=map_location)

    # Unwrap the usual nesting conventions to reach the raw parameter dict.
    src_state = copy.deepcopy(ori_state)
    src_state = src_state["state_dict"] if "state_dict" in src_state else src_state
    src_state = src_state["model_state_dict"] if "model_state_dict" in src_state else src_state
    src_state = src_state["model"] if "model" in src_state else src_state

    if isinstance(scope_map, str):
        scope_map = scope_map.split(",")
    else:
        # Copy so the default/caller list is never mutated by the += below.
        scope_map = list(scope_map)
    scope_map += ["module.", "None"]
    logging.info(f"scope_map: {scope_map}")

    for k in dst_state.keys():
        # Skip any destination key matching an excluded prefix.
        excludes_flag = False
        if excludes is not None:
            for k_ex in excludes:
                if k.startswith(k_ex):
                    logging.info(f"key: {k} matching: {k_ex}, excluded")
                    excludes_flag = True
                    break
        if excludes_flag:
            continue

        k_src = k

        if scope_map is not None:
            src_prefix = ""
            dst_prefix = ""
            # scope_map is a flat [src0, dst0, src1, dst1, ...] pair list.
            for i in range(0, len(scope_map), 2):
                src_prefix = scope_map[i] if scope_map[i].lower() != "none" else ""
                dst_prefix = scope_map[i + 1] if scope_map[i + 1].lower() != "none" else ""

                if dst_prefix == "" and (src_prefix + k) in src_state.keys():
                    k_src = src_prefix + k
                    if not k_src.startswith("module."):
                        logging.info(f"init param, map: {k} from {k_src} in ckpt")
                elif (
                    k.startswith(dst_prefix)
                    and k.replace(dst_prefix, src_prefix, 1) in src_state.keys()
                ):
                    k_src = k.replace(dst_prefix, src_prefix, 1)
                    if not k_src.startswith("module."):
                        logging.info(f"init param, map: {k} from {k_src} in ckpt")

        if k_src in src_state.keys():
            if ignore_init_mismatch and dst_state[k].shape != src_state[k_src].shape:
                logging.info(
                    f"ignore_init_mismatch:{ignore_init_mismatch}, dst: {k, dst_state[k].shape}, src: {k_src, src_state[k_src].shape}"
                )
            else:
                dst_state[k] = src_state[k_src]
        else:
            print(f"Warning, miss key in ckpt: {k}, {path}")

    obj.load_state_dict(dst_state, strict=True)
def build_model(**kwargs):
    """Build a Paraformer model from a model id/directory.

    Downloads/resolves the model directory, merges its config into *kwargs*,
    constructs :class:`Paraformer`, loads ``init_param`` weights, and applies
    dtype/device placement. Returns ``(model, kwargs)``.
    """
    assert "model" in kwargs
    kwargs = download_model(**kwargs)
    torch.set_num_threads(kwargs.get("ncpu", 4))

    # build tokenizer
    # Tokenizer construction was removed; the vocab size is hard-coded to the
    # token count of the downloaded tokens.json instead of being derived from it.
    kwargs["vocab_size"] = 8404

    # build model: model_conf from the config, overridden by caller kwargs.
    model_conf = {}
    deep_update(model_conf, kwargs.get("model_conf", {}))
    deep_update(model_conf, kwargs)
    model = Paraformer(**model_conf)

    # init_param: load pretrained weights if the checkpoint exists.
    init_param = kwargs.get("init_param", None)
    if init_param is not None:
        if os.path.exists(init_param):
            logging.info(f"Loading pretrained params from {init_param}")
            load_pretrained_model(
                model=model,
                path=init_param,
                ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
                oss_bucket=kwargs.get("oss_bucket", None),
                scope_map=kwargs.get("scope_map", []),
                excludes=kwargs.get("excludes", None),
            )
        else:
            print(f"error, init_param does not exist!: {init_param}")

    # fp16 / bf16 casting, then device placement.
    if kwargs.get("fp16", False):
        model.to(torch.float16)
    elif kwargs.get("bf16", False):
        model.to(torch.bfloat16)
    model.to(kwargs["device"])

    return model, kwargs
+ """ + model_type = "paraformer" + + def __init__( + self, + vocab_size: int = 8404, + encoder_dim: int = 512, + attention_heads: int = 4, + encoder_layers: int = 50, + decoder_layers: int = 16, + max_seq_len: int = 512, + frontend_conf: Optional[Dict] = None, + **kwargs + ): + if _TRANSFORMERS_AVAILABLE: + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.encoder_dim = encoder_dim + self.attention_heads = attention_heads + self.encoder_layers = encoder_layers + self.decoder_layers = decoder_layers + self.max_seq_len = max_seq_len + self.frontend_conf = frontend_conf or {} + + @classmethod + def from_funasr_config(cls, config_path: Union[str, Path]) -> "ParaformerConfig": + """Load configuration from FunASR config.yaml file.""" + try: + if _OMEGACONF_AVAILABLE: + config = OmegaConf.load(config_path) + + return cls( + vocab_size=config.get("vocab_size", 8404), + encoder_dim=config.get("encoder_conf", {}).get("output_size", 512), + attention_heads=config.get("encoder_conf", {}).get("attention_heads", 4), + encoder_layers=config.get("encoder_conf", {}).get("num_blocks", 50), + decoder_layers=config.get("decoder_conf", {}).get("num_blocks", 16), + max_seq_len=config.get("max_seq_len", 512), + frontend_conf=dict(config.get("frontend_conf", {})), + ) + except Exception as e: + logging.warning(f"Could not load FunASR config: {e}, using defaults") + return cls() + + +class ParaformerForASR(PreTrainedModel): + """ + Transformers-compatible wrapper for Paraformer ASR models. + + This class wraps FunASR Paraformer models to make them compatible with + the optimum-intel export pipeline. 
+ """ + if _TRANSFORMERS_AVAILABLE: + config_class = ParaformerConfig + base_model_prefix = "paraformer" + main_input_name = "speech" + + def __init__(self, config: ParaformerConfig, funasr_model=None): + if _TRANSFORMERS_AVAILABLE: + super().__init__(config) + self.config = config + self.funasr_model = funasr_model + self._jit_model = None + self._model_path = None + self._model_kwargs = {} + + @classmethod + def from_pretrained( + cls, + model_name_or_path: Union[str, Path], + *model_args, + cache_dir: Optional[str] = None, + **kwargs + ) -> "ParaformerForASR": + """ + Load a Paraformer model from a FunASR model directory or HuggingFace Hub. + """ + from huggingface_hub import snapshot_download + + model_path = Path(model_name_or_path) + + # Download from HuggingFace Hub if not a local path + if not model_path.exists(): + logging.info(f"Downloading Paraformer model from HuggingFace Hub: {model_name_or_path}") + model_path = Path(snapshot_download( + repo_id=str(model_name_or_path), + cache_dir=cache_dir, + token=kwargs.get("token"), + revision=kwargs.get("revision", "main"), + )) + + # Load config + config_yaml_path = model_path / "config.yaml" + if config_yaml_path.exists(): + config = ParaformerConfig.from_funasr_config(config_yaml_path) + else: + config = ParaformerConfig() + + # Load the FunASR model + device = kwargs.get("device", "cpu") + funasr_model, model_kwargs = build_model(model=str(model_path), device=device) + + instance = cls(config, funasr_model=funasr_model) + instance._model_path = model_path + instance._model_kwargs = model_kwargs + + return instance + + def get_jit_model(self) -> torch.jit.ScriptModule: + """Get or create the TorchScript model for export.""" + if self._jit_model is None: + _, self._jit_model = export( + self.funasr_model, + self._model_kwargs, + type="torchscript", + quantize=False, + device=str(self._model_kwargs.get("device", "cpu")) + ) + return self._jit_model + + def forward(self, speech: torch.Tensor, speech_lengths: 
def _load_paraformer_model(
    model_name_or_path: str,
    subfolder: str = "",
    revision: str = "main",
    cache_dir: str = None,
    token: Optional[str] = None,
    trust_remote_code: bool = False,
    **kwargs,
):
    """Load a Paraformer model for export (TasksManager compatible loader).

    ``subfolder`` and ``trust_remote_code`` are accepted to match the loader
    signature TasksManager expects, but are not used by this loader.
    """
    return ParaformerForASR.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
        token=token,
        revision=revision,
        **kwargs,
    )
class ParaformerConfig(PretrainedConfig):
    """
    Configuration class for Paraformer ASR models.

    This provides a transformers-compatible configuration for FunASR Paraformer models.
    NOTE(review): near-duplicate of ParaformerConfig in export_paraformer.py;
    consider sharing a single definition.
    """
    model_type = "paraformer"

    def __init__(
        self,
        vocab_size: int = 8404,
        encoder_dim: int = 512,
        attention_heads: int = 4,
        encoder_layers: int = 50,
        decoder_layers: int = 16,
        frontend_conf: Optional[Dict] = None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.encoder_dim = encoder_dim
        self.attention_heads = attention_heads
        self.encoder_layers = encoder_layers
        self.decoder_layers = decoder_layers
        self.frontend_conf = frontend_conf or {}

    @classmethod
    def from_funasr_config(cls, config_path: Union[str, Path]) -> "ParaformerConfig":
        """Load configuration from FunASR config.yaml file, defaulting on any failure."""
        try:
            # Lazy import: omegaconf is only required for this code path.
            from omegaconf import OmegaConf
            config = OmegaConf.load(config_path)

            return cls(
                vocab_size=config.get("vocab_size", 8404),
                encoder_dim=config.get("encoder_conf", {}).get("output_size", 512),
                attention_heads=config.get("encoder_conf", {}).get("attention_heads", 4),
                encoder_layers=config.get("encoder_conf", {}).get("num_blocks", 50),
                decoder_layers=config.get("decoder_conf", {}).get("num_blocks", 16),
                frontend_conf=dict(config.get("frontend_conf", {})),
            )
        except Exception as e:
            logger.warning(f"Could not load FunASR config: {e}, using defaults")
            return cls()
class ParaformerForASR(PreTrainedModel):
    """
    Transformers-compatible wrapper for Paraformer ASR models.

    This class wraps FunASR Paraformer models to make them compatible with
    the optimum-intel export pipeline.
    """
    config_class = ParaformerConfig
    base_model_prefix = "paraformer"
    main_input_name = "speech"

    def __init__(self, config: ParaformerConfig, funasr_model=None):
        super().__init__(config)
        self.funasr_model = funasr_model
        self._jit_model = None  # lazily traced TorchScript module

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path: Union[str, Path],
        *model_args,
        cache_dir: Optional[str] = None,
        **kwargs
    ) -> "ParaformerForASR":
        """
        Load a Paraformer model from a FunASR model directory or HuggingFace Hub.
        """
        from huggingface_hub import snapshot_download

        model_path = Path(model_name_or_path)

        # Download from HuggingFace Hub if not a local path
        if not model_path.exists():
            logger.info(f"Downloading Paraformer model from HuggingFace Hub: {model_name_or_path}")
            model_path = Path(snapshot_download(
                repo_id=str(model_name_or_path),
                cache_dir=cache_dir,
                token=kwargs.get("token"),
                revision=kwargs.get("revision", "main"),
            ))

        # Load config (defaults when config.yaml is absent).
        config_yaml_path = model_path / "config.yaml"
        if config_yaml_path.exists():
            config = ParaformerConfig.from_funasr_config(config_yaml_path)
        else:
            config = ParaformerConfig()

        # Load the FunASR model
        from optimum.exporters.openvino.modeling_paraformer import build_model

        device = kwargs.get("device", "cpu")
        funasr_model, model_kwargs = build_model(model=str(model_path), device=device)

        instance = cls(config, funasr_model=funasr_model)
        # NOTE(review): these attributes are created outside __init__; an
        # instance built directly (not via from_pretrained) will not have them.
        instance._model_path = model_path
        instance._model_kwargs = model_kwargs

        return instance

    def get_jit_model(self) -> torch.jit.ScriptModule:
        """Get or create the TorchScript model for export (traced once, then cached)."""
        if self._jit_model is None:
            from optimum.exporters.openvino.modeling_paraformer import export

            _, self._jit_model = export(
                self.funasr_model,
                self._model_kwargs,
                type="torchscript",
                quantize=False,
                device=str(self._model_kwargs.get("device", "cpu"))
            )
        return self._jit_model

    def forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor):
        """Forward pass through the wrapped FunASR model."""
        if self.funasr_model is not None:
            return self.funasr_model(speech, speech_lengths)
        raise ValueError("FunASR model not loaded")
class ParaformerOnnxConfig(OnnxConfig):
    """
    ONNX/OpenVINO export configuration for Paraformer models.

    Describes the traced graph's I/O: LFR speech features plus their lengths in,
    log-softmax logits plus per-utterance token counts out.
    """
    NORMALIZED_CONFIG_CLASS = ParaformerConfig
    DEFAULT_ONNX_OPSET = 14

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        return {
            "speech": {0: "batch_size", 1: "sequence_length", 2: "feature_dim"},
            "speech_lengths": {0: "batch_size"},
        }

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # Fix: the exported Paraformer graph has two outputs — the wrapped
        # forward returns (logits, token_num) — but "token_num" was missing here.
        return {
            "logits": {0: "batch_size", 1: "sequence_length"},
            "token_num": {0: "batch_size"},
        }

    def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict[str, Any]:
        """Generate dummy inputs for export."""
        batch_size = 1
        sequence_length = 1000  # ~10 seconds of audio at 16kHz with 10ms frame shift
        feature_dim = 560  # LFR features (80 mel * 7 frames)

        return {
            "speech": torch.randn(batch_size, sequence_length, feature_dim),
            "speech_lengths": torch.tensor([sequence_length], dtype=torch.int32),
        }
= HfFileSystem(token=kwargs.get("token")) + try: + repo_files = fs.ls(f"{model_name_or_path}", detail=False) + repo_files = [f.split("/")[-1] for f in repo_files] + + if "config.yaml" in repo_files and "tokens.json" in repo_files: + return True + if "am.mvn" in repo_files: + return True + except Exception: + pass + + return False + except Exception: + return False + + +def export_paraformer_to_openvino( + model_name_or_path: str, + output: Union[str, Path], + weight_format: str = "fp16", + cache_dir: str = None, + token: Optional[str] = None, + ov_config: Any = None, + **kwargs +) -> None: + """ + Export a Paraformer model to OpenVINO format. + + This function handles the complete export pipeline for FunASR Paraformer models, + including full INT8 quantization with calibration data when requested. + """ + import os + import openvino as ov + import shutil + import numpy as np + from optimum.exporters.openvino.modeling_paraformer import build_model, export + from huggingface_hub import snapshot_download + + model_path = Path(model_name_or_path) + output_path = Path(output) + + # Download from HuggingFace Hub if not a local path + if not model_path.exists(): + logger.info(f"Downloading Paraformer model from HuggingFace Hub: {model_name_or_path}") + model_path = Path(snapshot_download( + repo_id=str(model_name_or_path), + cache_dir=cache_dir, + token=token, + )) + + logger.info(f"Loading Paraformer model from {model_path}") + + # Build the FunASR model + device = kwargs.get("device", "cpu") + model, model_kwargs = build_model(model=str(model_path), device=device) + + # Export to TorchScript + logger.info("Converting to TorchScript...") + model_dir, jit_model = export(model, model_kwargs, type="torchscript", quantize=False, device=device) + + # Convert to OpenVINO + logger.info("Converting to OpenVINO format...") + ovm = ov.convert_model(jit_model, input=[([-1, -1, -1], torch.float32), ([-1], torch.int32)]) + + # Create output directory with ov_models subdirectory 
(matching optimum-intel structure) + ov_models_path = output_path / "ov_models" + ov_models_path.mkdir(parents=True, exist_ok=True) + output_model_path = ov_models_path / "openvino_model.xml" + + # Check if full INT8 quantization is requested (via ov_config with quantization_config) + apply_full_quant = False + dataset_name = None + num_samples = 50 + sym = False + + if ov_config is not None: + q_config = getattr(ov_config, "quantization_config", None) + if q_config is not None: + # Import configuration classes + try: + from optimum.intel.openvino.configuration import OVQuantizationConfig, OVWeightQuantizationConfig + + # Handle OVQuantizationConfig (from --quant-mode int8) + if isinstance(q_config, OVQuantizationConfig): + dtype = getattr(q_config, 'dtype', None) + dataset_name = getattr(q_config, 'dataset', None) + + if dtype == 'int8' and dataset_name is not None: + apply_full_quant = True + num_samples = getattr(q_config, 'num_samples', 50) or 50 + sym = getattr(q_config, 'sym', False) + logger.info(f"Full INT8 quantization requested with dataset={dataset_name}") + + # Handle OVWeightQuantizationConfig (from --weight-format int8) + elif isinstance(q_config, OVWeightQuantizationConfig): + apply_full_quant = False + weight_format = "int8" + + # Handle dict config (fallback) + elif isinstance(q_config, dict): + if q_config.get('dtype') == 'int8' and 'dataset' in q_config: + apply_full_quant = True + dataset_name = q_config.get('dataset') + num_samples = q_config.get('num_samples', 50) or 50 + sym = q_config.get('sym', False) + logger.info(f"Full INT8 quantization requested with dataset={dataset_name}") + except ImportError as e: + logger.warning(f"Could not import configuration classes: {e}") + + if apply_full_quant: + logger.info("Applying full INT8 quantization (weights + activations) for Paraformer...") + import nncf + import librosa + + # Helper function to extract paraformer features + def extract_paraformer_features(audio_path): + """Extract LFR features 
from audio for paraformer.""" + audio, sr = librosa.load(audio_path, sr=16000) + mel_spec = librosa.feature.melspectrogram( + y=audio, sr=sr, n_fft=512, hop_length=160, + win_length=400, n_mels=80, fmin=0, fmax=8000, power=2.0 + ) + log_mel = np.log(np.maximum(mel_spec, 1e-10)).T + log_mel = (log_mel - np.mean(log_mel, axis=0)) / (np.std(log_mel, axis=0) + 1e-10) + T = log_mel.shape[0] + pad_len = (6 - (T % 6)) % 6 + if pad_len > 0: + log_mel = np.pad(log_mel, ((0, pad_len), (0, 0)), mode='edge') + T_lfr = log_mel.shape[0] // 6 + lfr_features = [] + for i in range(T_lfr): + frames = [log_mel[min(i * 6 + j, log_mel.shape[0] - 1)] for j in range(7)] + lfr_features.append(np.concatenate(frames)) + return np.array(lfr_features, dtype=np.float32) + + # Generate calibration dataset + calibration_samples = [] + + if dataset_name and ('aishell' in dataset_name.lower()): + # Use AISHELL-style calibration with example audio + example_audio = os.path.join(model_dir, "example", "asr_example.wav") + + if not os.path.exists(example_audio): + raise ValueError( + f"AISHELL calibration requires example audio at {example_audio}. " + "File not found. Please ensure the model was downloaded correctly." + ) + + logger.info(f"Generating {num_samples} calibration samples from AISHELL audio...") + base_features = extract_paraformer_features(example_audio) + + # Generate diverse calibration samples with noise augmentation + np.random.seed(42) + for i in range(num_samples): + noise = np.random.randn(*base_features.shape).astype(np.float32) * (0.01 + i * 0.0004) + features = base_features + noise + + speech = features[np.newaxis, :].astype(np.float32) + speech_lengths = np.array([features.shape[0]], dtype=np.int32) + # Use 'speech.1' as the model input name (from OV conversion) + calibration_samples.append({'speech.1': speech, 'speech_lengths': speech_lengths}) + else: + raise ValueError( + f"Unknown dataset '{dataset_name}' for paraformer quantization. 
" + "Please use 'aishell-1' for AISHELL-style calibration." + ) + + # Create NNCF calibration dataset + calibration_dataset = nncf.Dataset(calibration_samples) + + # Set quantization preset based on sym flag + preset = nncf.QuantizationPreset.PERFORMANCE if sym else nncf.QuantizationPreset.MIXED + + # Get smooth_quant_alpha if available + smooth_quant_alpha = None + if ov_config is not None: + q_config = getattr(ov_config, "quantization_config", None) + if q_config is not None: + try: + from optimum.intel.openvino.configuration import OVQuantizationConfig + if isinstance(q_config, OVQuantizationConfig): + smooth_quant_alpha = getattr(q_config, 'smooth_quant_alpha', None) + except ImportError: + pass + + logger.info(f"Applying nncf.quantize() for full INT8 quantization...") + + # Build kwargs for nncf.quantize with per-tensor quantization for dynamic shape support + from nncf.quantization.advanced_parameters import AdvancedQuantizationParameters, QuantizationParameters + + quant_kwargs = { + 'subset_size': num_samples, + 'model_type': nncf.ModelType.TRANSFORMER, + 'preset': preset, + 'advanced_parameters': AdvancedQuantizationParameters( + # Use per-tensor quantization for activations to avoid shape-specific constants + activations_quantization_params=QuantizationParameters(per_channel=False), + ) + } + + # Add smooth_quant_alpha if set + if smooth_quant_alpha is not None and smooth_quant_alpha != -1: + from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters + quant_kwargs['advanced_parameters'] = AdvancedQuantizationParameters( + activations_quantization_params=QuantizationParameters(per_channel=False), + smooth_quant_alphas=AdvancedSmoothQuantParameters( + matmul=smooth_quant_alpha, + convolution=smooth_quant_alpha + ) + ) + + ovm = nncf.quantize(ovm, calibration_dataset, **quant_kwargs) + logger.info("Full INT8 quantization complete.") + + # Save with FP16 compression + ov.save_model(ovm, str(output_model_path), compress_to_fp16=True) + + 
elif weight_format.lower() == "int8": + # Weight-only INT8 compression (from --weight-format int8) + logger.info("Applying INT8 weight compression...") + try: + import nncf + ovm = nncf.compress_weights(ovm, mode=nncf.CompressWeightsMode.INT8_SYM) + except ImportError: + logger.warning("NNCF not available, saving without INT8 compression") + + ov.save_model(ovm, str(output_model_path), compress_to_fp16=True) + else: + # No quantization - just serialize the model + logger.info(f"Saving model to {output_model_path}") + ov.serialize(ovm, str(output_model_path)) + + # Copy auxiliary files to ov_models directory + for aux_file in ["tokens.json", "config.yaml", "configuration.json", "am.mvn", "seg_dict"]: + src = model_path / aux_file + if src.exists(): + shutil.copy(src, ov_models_path / aux_file) + + logger.info(f"Paraformer model exported successfully to {output_path}") + + +def _load_paraformer_model( + model_name_or_path: str, + subfolder: str = "", + revision: str = "main", + cache_dir: str = None, + token: Optional[str] = None, + trust_remote_code: bool = False, + **kwargs, +): + """Load a Paraformer model for export.""" + return ParaformerForASR.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + token=token, + revision=revision, + **kwargs, + ) + + +def register_paraformer_with_tasks_manager(): + """ + Register Paraformer support with TasksManager. + + This function adds the necessary mappings for Paraformer to work + with the standard optimum-intel export pipeline. 
+ """ + # Register paraformer library with supported model types + if "paraformer" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: + TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["paraformer"] = { + "paraformer": { + "automatic-speech-recognition": ("ParaformerForASR",), + } + } + + # Register model loader for paraformer library + if "paraformer" not in TasksManager._LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP: + TasksManager._LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP["paraformer"] = { + "automatic-speech-recognition": _load_paraformer_model, + } + + logger.debug("Registered Paraformer support with TasksManager") + + +def patch_main_quantize(): + """ + Patch the _main_quantize function to skip Paraformer models + (since quantization is already handled in main_export). + """ + try: + from optimum.exporters.openvino import __main__ as ov_main + + original_main_quantize = ov_main._main_quantize + + @wraps(original_main_quantize) + def patched_main_quantize( + model_name_or_path: str, + **kwargs + ): + # Debug logging + logger.info(f"patched_main_quantize called for model: {model_name_or_path}") + + # Check if this is a Paraformer model + cache_dir = kwargs.get("cache_dir") + is_paraformer = _is_paraformer_model(model_name_or_path, cache_dir=cache_dir) + logger.info(f"Is Paraformer model: {is_paraformer}") + + if is_paraformer: + logger.info("Skipping _main_quantize for Paraformer (already quantized in main_export)") + # For Paraformer, quantization is already done in main_export, so just return + return + + # Not a Paraformer model, use original quantization + return original_main_quantize(model_name_or_path, **kwargs) + + # Apply the patch + ov_main._main_quantize = patched_main_quantize + logger.debug("Patched _main_quantize to skip Paraformer models") + + except Exception as e: + logger.warning(f"Could not patch _main_quantize for Paraformer support: {e}") + + +def patch_main_export(): + """ + Patch the main_export function to handle Paraformer models automatically. 
+ + This allows `optimum-cli export openvino --model funasr/paraformer-zh ...` to work + without modifying __main__.py directly. + """ + try: + from optimum.exporters.openvino import __main__ as ov_main + + original_main_export = ov_main.main_export + + @wraps(original_main_export) + def patched_main_export( + model_name_or_path: str, + output: Union[str, Path], + task: str = "auto", + **kwargs + ): + # Check if this is a Paraformer model + if _is_paraformer_model(model_name_or_path, cache_dir=kwargs.get("cache_dir")): + logger.info("Detected Paraformer model (FunASR). Using specialized export.") + + # Get ov_config for quantization settings + ov_config = kwargs.get("ov_config") + + # Determine weight format from kwargs + weight_format = kwargs.get("weight_format", "fp16") + if weight_format is None: + weight_format = "fp16" + + # Check ov_config for quantization settings to determine weight_format + if ov_config is not None: + quant_config = getattr(ov_config, "quantization_config", None) + if quant_config is not None: + if hasattr(quant_config, 'bits') and quant_config.bits == 8: + weight_format = "int8" + elif hasattr(quant_config, 'weight_dtype') and 'int8' in str(quant_config.weight_dtype).lower(): + weight_format = "int8" + + export_paraformer_to_openvino( + model_name_or_path=model_name_or_path, + output=output, + weight_format=weight_format, + cache_dir=kwargs.get("cache_dir"), + token=kwargs.get("token"), + device=kwargs.get("device", "cpu"), + ov_config=ov_config, + ) + return + + # Not a Paraformer model, use original export + return original_main_export(model_name_or_path, output, task, **kwargs) + + # Apply the patch + ov_main.main_export = patched_main_export + logger.debug("Patched main_export to support Paraformer models") + + except Exception as e: + logger.warning(f"Could not patch main_export for Paraformer support: {e}") + + +# Auto-register when this module is imported +register_paraformer_with_tasks_manager() +patch_main_export() 
+patch_main_quantize() + diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index dd110267ea..07ca64db81 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -104,6 +104,7 @@ "OVModelForSpeechSeq2Seq", "OVModelForTextToSpeechSeq2Seq", "OVModelForVision2Seq", + "OVParaformerForSpeechSeq2Seq", "OVModelForVisualCausalLM", "OVModelForSequenceClassification", "OVModelForTokenClassification", @@ -132,6 +133,7 @@ "OVModelForSpeechSeq2Seq", "OVModelForTextToSpeechSeq2Seq", "OVModelForVision2Seq", + "OVParaformerForSpeechSeq2Seq", "OVModelForVisualCausalLM", "OVModelForSequenceClassification", "OVModelForTokenClassification", @@ -416,6 +418,7 @@ OVModelForSpeechSeq2Seq, OVModelForTextToSpeechSeq2Seq, OVModelForTokenClassification, + OVParaformerForSpeechSeq2Seq, OVModelForVision2Seq, OVModelForVisualCausalLM, OVModelForZeroShotImageClassification, diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8441944800..6c2926e5e3 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -86,6 +86,7 @@ ) from .modeling_sam import OVSamModel from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM, OVModelForSpeechSeq2Seq, OVModelForVision2Seq +from .modeling_speech2text import OVParaformerForSpeechSeq2Seq from .modeling_text2speech import OVModelForTextToSpeechSeq2Seq from .modeling_visual_language import OVModelForVisualCausalLM diff --git a/optimum/intel/openvino/modeling_speech2text.py b/optimum/intel/openvino/modeling_speech2text.py new file mode 100644 index 0000000000..89c057d6c2 --- /dev/null +++ b/optimum/intel/openvino/modeling_speech2text.py @@ -0,0 +1,850 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
OpenVINO Paraformer Speech-to-Text Model Implementation
Following the pattern from optimum-intel's modeling_text2speech.py
"""

import logging
import os
from dataclasses import dataclass
from pathlib import Path
from tempfile import gettempdir
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import openvino
from openvino import CompiledModel, Core, Model
import torch
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from transformers import AutoConfig, PretrainedConfig
from transformers.utils import ModelOutput

from .utils import OV_DECODER_NAME, OV_ENCODER_NAME, OV_XML_FILE_NAME, OV_TO_PT_TYPE

logger = logging.getLogger(__name__)

# Single shared OpenVINO Core used by all model parts in this module.
core = Core()

# Additional model file name for Paraformer predictor
OV_PREDICTOR_NAME = "openvino_predictor_model.xml"


@dataclass
class ParaformerModelOutput(ModelOutput):
    """
    Output type of ParaformerModel.

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, vocab_size)`):
            Predicted logits for each token.
        token_num (`torch.LongTensor` of shape `(batch_size,)`):
            Number of predicted tokens for each sequence.
        token_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Decoded token IDs (if `decode=True`).
    """
    logits: torch.FloatTensor = None
    token_num: torch.LongTensor = None
    token_ids: torch.LongTensor = None


class OVParaformerModelPart:
    """
    Base class for OpenVINO Paraformer model components.
    Following the OVModelPart pattern from optimum-intel.

    Holds a (possibly compiled) OV model plus discovered input/output name and
    dtype maps; compilation is lazy unless the parent is in compile-only mode.
    """
    _model_name = "model"

    def __init__(
        self,
        model: Union[Model, CompiledModel],
        parent_model: "OVParaformerForSpeechSeq2Seq",
        ov_config: Optional[Dict[str, str]] = None,
        model_name: str = None,
    ):
        self.model = model
        self.parent_model = parent_model
        self._model_name = model_name or self._model_name

        self._compile_only = getattr(parent_model, '_compile_only', False)
        # NOTE(review): a caller-supplied ov_config is stored un-copied here and
        # compile() may add CACHE_DIR to it — confirm shared-dict mutation is intended.
        self.ov_config = ov_config or getattr(parent_model, 'ov_config', {}).copy()

        # Initialize request
        if self._compile_only and isinstance(model, CompiledModel):
            self.request = model.create_infer_request()
        else:
            self.request = None

        # Extract input/output metadata. A CompiledModel exposes its graph via
        # get_runtime_model(); a plain Model can be inspected directly.
        model_for_meta = model.get_runtime_model() if isinstance(model, CompiledModel) else model

        self.input_names: Dict[str, int] = {}
        self.input_dtypes: Dict[str, str] = {}
        for idx, inp in enumerate(model_for_meta.inputs):
            try:
                names = inp.get_names()
                # Prefer a "clean" tensor name without path separators; fall back
                # to the first name or a positional placeholder.
                name = next((n for n in names if "/" not in n), list(names)[0] if names else f"input_{idx}")
            except Exception:
                name = f"input_{idx}"
            self.input_names[name] = idx
            self.input_dtypes[name] = inp.get_element_type().get_type_name()

        self.output_names: Dict[str, int] = {}
        self.output_dtypes: Dict[str, str] = {}
        for idx, out in enumerate(model_for_meta.outputs):
            try:
                names = out.get_names()
                name = next((n for n in names if "/" not in n), list(names)[0] if names else f"output_{idx}")
            except Exception:
                name = f"output_{idx}"
            self.output_names[name] = idx
            self.output_dtypes[name] = out.get_element_type().get_type_name()

    @property
    def _device(self) -> str:
        # Device string is owned by the parent wrapper.
        return self.parent_model._device

    @property
    def device(self) -> torch.device:
        # Torch-facing tensors always live on CPU regardless of the OV device.
        return torch.device("cpu")

    @property
    def dtype(self) -> Optional[torch.dtype]:
        # First floating-point dtype found among inputs, then outputs; None if none.
        for dtype in self.input_dtypes.values():
            torch_dtype = OV_TO_PT_TYPE.get(dtype)
            if torch_dtype is not None and torch_dtype.is_floating_point:
                return torch_dtype
        for dtype in self.output_dtypes.values():
            torch_dtype = OV_TO_PT_TYPE.get(dtype)
            if torch_dtype is not None and torch_dtype.is_floating_point:
                return torch_dtype
        return None

    def compile(self):
        """Compile the model for inference."""
        if self._compile_only and isinstance(self.model, CompiledModel):
            if self.request is None:
                self.request = self.model.create_infer_request()
            return

        if self.request is None:
            # Set cache directory for GPU (skipped for temp dirs and explicit CACHE_DIR)
            model_dir = getattr(self.parent_model, 'model_save_dir', None)
            if (
                model_dir is not None
                and "CACHE_DIR" not in self.ov_config
                and not str(model_dir).startswith(gettempdir())
                and "gpu" in self._device.lower()
            ):
                self.ov_config["CACHE_DIR"] = os.path.join(str(model_dir), self._model_name, "model_cache")

            logger.info(f"Compiling {self._model_name} to {self._device}...")
            compiled_model = core.compile_model(self.model, self._device, self.ov_config)
            self.request = compiled_model.create_infer_request()
            logger.info(f"✅ {self._model_name} compiled successfully")

    def clear_requests(self):
        """Clear inference request to free resources."""
        if self._compile_only:
            raise ValueError("`clear_requests()` is not supported in `compile_only` mode")
        self.request = None

    def _prepare_input(self, tensor: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
        """Convert input to numpy array."""
        if isinstance(tensor, torch.Tensor):
            return tensor.cpu().numpy()
        return tensor

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    def forward(self, *args, **kwargs):
        # Subclasses implement the component-specific inference.
        raise NotImplementedError


class OVParaformerEncoder(OVParaformerModelPart):
    """
    Paraformer Encoder component for OpenVINO inference.

    Processes input speech features and produces encoder hidden states.
    """
    _model_name = "encoder"

    def forward(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor, np.ndarray],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass through the encoder.
+ + Args: + speech: Input speech features [batch, time, features] + speech_lengths: Valid lengths for each sequence [batch] + + Returns: + encoder_out: Encoded hidden states [batch, time, hidden] + encoder_out_lens: Output lengths [batch] + """ + self.compile() + + inputs = { + "speech": self._prepare_input(speech), + "speech_lengths": self._prepare_input(speech_lengths), + } + + self.request.infer(inputs) + + encoder_out = torch.from_numpy(self.request.get_output_tensor(0).data.copy()) + encoder_out_lens = torch.from_numpy(self.request.get_output_tensor(1).data.copy()) + + return encoder_out, encoder_out_lens + + +class OVParaformerPredictor(OVParaformerModelPart): + """ + Paraformer CIF Predictor component for OpenVINO inference. + + Predicts acoustic embeddings and token counts from encoder output. + """ + _model_name = "predictor" + + def forward( + self, + encoder_out: Union[torch.Tensor, np.ndarray], + encoder_out_lens: Union[torch.Tensor, np.ndarray], + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Forward pass through the CIF predictor. 
+ + Args: + encoder_out: Encoder output [batch, time, hidden] + encoder_out_lens: Encoder output lengths [batch] + + Returns: + acoustic_embeds: Predicted acoustic embeddings [batch, token_num, hidden] + token_num: Number of predicted tokens [batch] + alphas: CIF weights [batch, time] (optional) + peak_index: Peak indices [batch, token_num] (optional) + """ + self.compile() + + # Create attention mask [batch, 1, max_len] + if isinstance(encoder_out, torch.Tensor): + batch_size, max_len = encoder_out.shape[0], encoder_out.shape[1] + arange = torch.arange(max_len, dtype=torch.int32).unsqueeze(0).expand(batch_size, -1) + mask = (arange < encoder_out_lens.unsqueeze(1).to(torch.int32)).to(torch.float32) + mask = mask.unsqueeze(1) + else: + batch_size, max_len = encoder_out.shape[0], encoder_out.shape[1] + arange = np.arange(max_len, dtype=np.int32)[np.newaxis, :].repeat(batch_size, axis=0) + mask = (arange < encoder_out_lens[:, np.newaxis]).astype(np.float32) + mask = mask[:, np.newaxis, :] + + # Map encoder_out and mask to actual OV input names using discovered input_names + # to avoid mismatch with TorchScript arg names + input_names_list = list(self.input_names.keys()) + inputs = {} + if len(input_names_list) > 0: + inputs[input_names_list[0]] = self._prepare_input(encoder_out) + if len(input_names_list) > 1: + inputs[input_names_list[1]] = self._prepare_input(mask) + + self.request.infer(inputs) + + acoustic_embeds = torch.from_numpy(self.request.get_output_tensor(0).data.copy()) + token_num = torch.from_numpy(self.request.get_output_tensor(1).data.copy()) + + alphas = None + peak_index = None + if len(self.output_names) > 2: + alphas = torch.from_numpy(self.request.get_output_tensor(2).data.copy()) + if len(self.output_names) > 3: + peak_index = torch.from_numpy(self.request.get_output_tensor(3).data.copy()) + + return acoustic_embeds, token_num, alphas, peak_index + + +class OVParaformerDecoder(OVParaformerModelPart): + """ + Paraformer Decoder component for 
OpenVINO inference.

    Produces output logits from encoder output and acoustic embeddings.
    """
    _model_name = "decoder"

    def forward(
        self,
        encoder_out: Union[torch.Tensor, np.ndarray],
        encoder_out_lens: Union[torch.Tensor, np.ndarray],
        acoustic_embeds: Union[torch.Tensor, np.ndarray],
        token_num: Union[torch.Tensor, np.ndarray],
    ) -> torch.Tensor:
        """
        Forward pass through the decoder.

        Args:
            encoder_out: Encoder output [batch, time, hidden]
            encoder_out_lens: Encoder output lengths [batch]
            acoustic_embeds: Acoustic embeddings from predictor [batch, token_num, hidden]
            token_num: Number of tokens [batch]

        Returns:
            logits: Output logits [batch, token_num, vocab_size]
        """
        self.compile()

        # NOTE(review): hard-coded input names — confirm they match the exported
        # decoder's tensor names (cf. the predictor's positional mapping).
        inputs = {
            "encoder_out": self._prepare_input(encoder_out),
            "encoder_out_lens": self._prepare_input(encoder_out_lens),
            "acoustic_embeds": self._prepare_input(acoustic_embeds),
            "token_num": self._prepare_input(token_num),
        }

        self.request.infer(inputs)

        logits = torch.from_numpy(self.request.get_output_tensor(0).data.copy())

        return logits


class OVParaformerForSpeechSeq2Seq:
    """
    OpenVINO Paraformer model for automatic speech recognition.

    This class provides a unified interface for loading and running inference
    on Paraformer models exported to OpenVINO IR format. It supports both
    single-file models and multi-component (encoder/predictor/decoder) models.

    Following the pattern from optimum-intel's OVModelForTextToSpeechSeq2Seq.

    Args:
        model_path: Path to the model directory containing OpenVINO IR files
        device: Target device for inference (CPU, GPU, AUTO, etc.)
        ov_config: OpenVINO runtime configuration dictionary
        compile_only: If True, skip model loading and compile directly from files

    Example:
        ```python
        model = OVParaformerForSpeechSeq2Seq.from_pretrained(
            "/path/to/paraformer-zh/ov_models",
            device="GPU",
        )

        # Run inference
        output = model(speech_features, speech_lengths)
        token_ids = output.token_ids
        ```
    """

    auto_model_class = None
    export_feature = "automatic-speech-recognition"
    main_input_name = "speech"

    def __init__(
        self,
        model: Optional[Model] = None,
        encoder: Optional[Model] = None,
        predictor: Optional[Model] = None,
        decoder: Optional[Model] = None,
        config: Optional[PretrainedConfig] = None,
        device: str = "CPU",
        ov_config: Optional[Dict[str, str]] = None,
        model_save_dir: Optional[Union[str, Path]] = None,
        compile_only: bool = False,
        compile: bool = True,
        **kwargs,
    ):
        self.config = config
        self.model_save_dir = Path(model_save_dir) if model_save_dir else None
        self._device = device.upper()
        self.ov_config = ov_config.copy() if ov_config else {}
        self._compile_only = compile_only
        self.preprocessors = kwargs.get("preprocessors", [])
        self.generation_config = kwargs.get("generation_config", None)

        # Determine if we have a single model or separate components
        self._single_model = model is not None

        if self._single_model:
            # Single combined model
            self.model = model
            self._model_component = OVParaformerModelPart(
                model, self, ov_config=self.ov_config, model_name="model"
            )
            self.encoder = None
            self.predictor = None
            self.decoder = None

            # Extract I/O metadata from the single model
            self.input_names = self._model_component.input_names.copy()
            self.output_names = self._model_component.output_names.copy()
        else:
            # Separate components
            self.model = None
            self._model_component = None
            self.encoder = OVParaformerEncoder(encoder, self, model_name="encoder") if encoder else None
            self.predictor = OVParaformerPredictor(predictor, self, model_name="predictor") if predictor else None
            self.decoder = OVParaformerDecoder(decoder, self, model_name="decoder") if decoder else None

            # Combine I/O names: pipeline inputs come from the encoder,
            # pipeline outputs from the decoder.
            self.input_names = {}
            self.output_names = {}
            if self.encoder:
                self.input_names.update(self.encoder.input_names)
            if self.decoder:
                self.output_names.update(self.decoder.output_names)

        if compile and not compile_only:
            self.compile()

    @classmethod
    def from_pretrained(
        cls,
        model_id: Union[str, Path],
        device: str = "CPU",
        ov_config: Optional[Dict[str, str]] = None,
        token: Optional[Union[bool, str]] = None,
        revision: Optional[str] = None,
        force_download: bool = False,
        cache_dir: str = HUGGINGFACE_HUB_CACHE,
        local_files_only: bool = False,
        compile_only: bool = False,
        compile: bool = True,
        **kwargs,
    ) -> "OVParaformerForSpeechSeq2Seq":
        """
        Load a Paraformer model from a local directory or Hugging Face Hub.

        Args:
            model_id: Local path or Hugging Face Hub model ID
            device: Target device (CPU, GPU, AUTO)
            ov_config: OpenVINO configuration dictionary
            token: Hugging Face authentication token
            revision: Model revision to use
            force_download: Force re-download from Hub
            cache_dir: Directory to cache downloaded models
            local_files_only: Only use local files, no Hub download
            compile_only: Load as compiled model directly
            compile: Whether to compile models after loading

        Returns:
            OVParaformerForSpeechSeq2Seq instance

        Raises:
            FileNotFoundError: if neither a single-model IR nor component IRs exist.
        """
        # NOTE(review): only local paths are handled here — the Hub-related
        # arguments (token/revision/force_download/...) are currently unused.
        model_path = Path(model_id)

        # Try to load config
        config = None
        config_paths = [
            model_path / "config.json",
            model_path / "config.yaml",
        ]
        for cfg_path in config_paths:
            if cfg_path.exists():
                try:
                    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
                    break
                except Exception:
                    pass

        # Check for single model file
        single_model_path = model_path / OV_XML_FILE_NAME
        if single_model_path.exists():
            logger.info(f"Loading single Paraformer model from {single_model_path}")
            # In compile_only mode _load_model compiles immediately for `device`.
            model = cls._load_model(single_model_path, device if compile_only else None, ov_config)
            return cls(
                model=model,
                config=config,
                device=device,
                ov_config=ov_config,
                model_save_dir=model_path,
                compile_only=compile_only,
                compile=compile,
                **kwargs,
            )

        # Check for separate component files
        encoder_path = model_path / OV_ENCODER_NAME
        predictor_path = model_path / OV_PREDICTOR_NAME
        decoder_path = model_path / OV_DECODER_NAME

        if encoder_path.exists() and decoder_path.exists():
            logger.info(f"Loading Paraformer components from {model_path}")

            encoder = cls._load_model(encoder_path, device if compile_only else None, ov_config)
            decoder = cls._load_model(decoder_path, device if compile_only else None, ov_config)
            # Predictor is optional; the pipeline degrades gracefully without it.
            predictor = None
            if predictor_path.exists():
                predictor = cls._load_model(predictor_path, device if compile_only else None, ov_config)

            return cls(
                encoder=encoder,
                predictor=predictor,
                decoder=decoder,
                config=config,
                device=device,
                ov_config=ov_config,
                model_save_dir=model_path,
                compile_only=compile_only,
                compile=compile,
                **kwargs,
            )

        raise FileNotFoundError(
            f"Could not find Paraformer model files in {model_path}. "
            f"Expected either '{OV_XML_FILE_NAME}' or component files like '{OV_ENCODER_NAME}'."
        )

    @staticmethod
    def _load_model(
        path: Path,
        device: Optional[str] = None,
        ov_config: Optional[Dict[str, str]] = None,
    ) -> Union[Model, CompiledModel]:
        """Load an OpenVINO model from file; compile immediately if `device` is given."""
        logger.info(f"Loading model from {path}")
        model = core.read_model(path)

        if device is not None:
            # Compile directly (compile_only mode)
            return core.compile_model(model, device, ov_config or {})

        return model

    @property
    def device(self) -> torch.device:
        """Return torch device (always CPU for compatibility)."""
        return torch.device("cpu")

    @property
    def dtype(self) -> torch.dtype:
        """Return model dtype (falls back to float32 when undeterminable)."""
        if self._model_component:
            return self._model_component.dtype
        if self.encoder:
            return self.encoder.dtype
        return torch.float32

    @property
    def _component_names(self) -> List[str]:
        """Return list of loaded component names."""
        if self._single_model:
            return ["model"]
        names = []
        if self.encoder: names.append("encoder")
        if self.predictor: names.append("predictor")
        if self.decoder: names.append("decoder")
        return names

    @property
    def components(self) -> Dict[str, OVParaformerModelPart]:
        """Return dictionary of model components."""
        if self._single_model:
            return {"model": self._model_component}
        comps = {}
        if self.encoder: comps["encoder"] = self.encoder
        if self.predictor: comps["predictor"] = self.predictor
        if self.decoder: comps["decoder"] = self.decoder
        return comps

    def to(self, device: str) -> "OVParaformerForSpeechSeq2Seq":
        """
        Move model to specified device.

        Args:
            device: Target device (CPU, GPU, AUTO)

        Returns:
            self for method chaining
        """
        if self._compile_only:
            raise ValueError("`to()` is not supported in `compile_only` mode")

        if isinstance(device, str):
            self._device = device.upper()
            # Drop existing requests so the next compile() targets the new device.
            self.clear_requests()

        return self

    def compile(self):
        """Compile all model components for inference."""
        for component in self.components.values():
            component.compile()

    def clear_requests(self):
        """Clear all inference requests."""
        for component in self.components.values():
            component.clear_requests()

    def __call__(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor, np.ndarray],
        decode: bool = True,
        **kwargs,
    ) -> ParaformerModelOutput:
        """
        Run inference on speech input.

        Args:
            speech: Input speech features [batch, time, features]
            speech_lengths: Valid lengths for each sequence [batch]
            decode: Whether to decode logits to token IDs

        Returns:
            ParaformerModelOutput containing logits, token_num, and optionally token_ids
        """
        return self.forward(speech, speech_lengths, decode=decode, **kwargs)

    def forward(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor, np.ndarray],
        decode: bool = True,
        **kwargs,
    ) -> ParaformerModelOutput:
        """
        Forward pass through the model.

        Args:
            speech: Input speech features [batch, time, features]
            speech_lengths: Valid lengths for each sequence [batch]
            decode: Whether to decode logits to token IDs

        Returns:
            ParaformerModelOutput containing logits, token_num, and optionally token_ids
        """
        # Dispatch on model layout chosen at construction time.
        if self._single_model:
            return self._forward_single_model(speech, speech_lengths, decode=decode)
        else:
            return self._forward_components(speech, speech_lengths, decode=decode)

    def _forward_single_model(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor, np.ndarray],
        decode: bool = True,
    ) -> ParaformerModelOutput:
        """Forward pass for single combined model."""
        self._model_component.compile()

        # Find speech input name (might be 'speech' or 'speech.1')
        speech_input_name = None
        for name in self.input_names:
            if 'speech' in name.lower() and 'length' not in name.lower():
                speech_input_name = name
                break

        if speech_input_name is None:
            # Fall back to first input
            speech_input_name = list(self.input_names.keys())[0]

        # Prepare inputs
        speech_np = speech.cpu().numpy() if isinstance(speech, torch.Tensor) else speech
        lengths_np = speech_lengths.cpu().numpy() if isinstance(speech_lengths, torch.Tensor) else speech_lengths

        inputs = {
            speech_input_name: speech_np,
            "speech_lengths": lengths_np,
        }

        # Run inference
        self._model_component.request.infer(inputs)

        # Get outputs (copy so results survive the next infer call)
        logits = torch.from_numpy(self._model_component.request.get_output_tensor(0).data.copy())
        token_num = None
        if len(self.output_names) > 1:
            token_num = torch.from_numpy(self._model_component.request.get_output_tensor(1).data.copy())

        # Decode if requested
        token_ids = None
        if decode:
            token_ids = self.decode(logits, token_num)

        return ParaformerModelOutput(
            logits=logits,
            token_num=token_num,
            token_ids=token_ids,
        )

    def _forward_components(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        speech_lengths: Union[torch.Tensor,
np.ndarray], + decode: bool = True, + ) -> ParaformerModelOutput: + """Forward pass for separate component models.""" + # 1. Encoder + encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths) + + # 2. Predictor (if available) + if self.predictor is not None: + acoustic_embeds, token_num, alphas, peak_index = self.predictor( + encoder_out, encoder_out_lens + ) + else: + # Without predictor, pass encoder output directly + acoustic_embeds = encoder_out + token_num = encoder_out_lens + + # 3. Decoder + logits = self.decoder(encoder_out, encoder_out_lens, acoustic_embeds, token_num) + + # Decode if requested + token_ids = None + if decode: + token_ids = self.decode(logits, token_num) + + return ParaformerModelOutput( + logits=logits, + token_num=token_num, + token_ids=token_ids, + ) + + def decode( + self, + logits: torch.Tensor, + token_num: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Decode logits to token IDs using greedy decoding. + + Args: + logits: Output logits [batch, seq_len, vocab_size] + token_num: Optional token numbers for each batch item [batch] + + Returns: + token_ids: Predicted token IDs [batch, seq_len] + """ + token_ids = torch.argmax(logits, dim=-1) + + # Mask out padding if token_num is provided + if token_num is not None: + batch_size = token_ids.shape[0] + max_len = token_ids.shape[1] + for i in range(batch_size): + num = int(token_num[i].item()) if torch.is_tensor(token_num[i]) else int(token_num[i]) + if num < max_len: + token_ids[i, num:] = 0 + + return token_ids + + def generate( + self, + speech: Union[torch.Tensor, np.ndarray], + speech_lengths: Union[torch.Tensor, np.ndarray], + **kwargs, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Generate token IDs from speech input. + + This is an alias for forward() with decode=True for API compatibility. 
+ + Args: + speech: Input speech features [batch, time, features] + speech_lengths: Valid lengths for each sequence [batch] + + Returns: + token_ids: Predicted token IDs [batch, seq_len] + token_num: Number of valid tokens per sequence [batch] + """ + output = self.forward(speech, speech_lengths, decode=True, **kwargs) + return output.token_ids, output.token_num + + def save_pretrained( + self, + save_directory: Union[str, Path], + ): + """ + Save model to directory. + + Args: + save_directory: Directory to save model files + """ + save_path = Path(save_directory) + save_path.mkdir(parents=True, exist_ok=True) + + if self._single_model: + model_path = save_path / OV_XML_FILE_NAME + openvino.save_model(self.model, str(model_path)) + logger.info(f"Saved model to {model_path}") + else: + if self.encoder: + encoder_path = save_path / OV_ENCODER_NAME + openvino.save_model(self.encoder.model, str(encoder_path)) + logger.info(f"Saved encoder to {encoder_path}") + if self.predictor: + predictor_path = save_path / OV_PREDICTOR_NAME + openvino.save_model(self.predictor.model, str(predictor_path)) + logger.info(f"Saved predictor to {predictor_path}") + if self.decoder: + decoder_path = save_path / OV_DECODER_NAME + openvino.save_model(self.decoder.model, str(decoder_path)) + logger.info(f"Saved decoder to {decoder_path}") + + # Save config if available + if self.config is not None: + self.config.save_pretrained(save_path) + + +# Alias for backwards compatibility +OVModelForSpeech2Seq = OVParaformerForSpeechSeq2Seq +load_paraformer_model = OVParaformerForSpeechSeq2Seq.from_pretrained + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Paraformer OpenVINO Inference") + parser.add_argument("--model", required=True, help="Path to OpenVINO model directory") + parser.add_argument("--device", default="CPU", help="Device (CPU/GPU/AUTO)") + parser.add_argument("--input", help="Path to input speech .npy file") + 
parser.add_argument("--lengths", help="Path to lengths .npy file") + + args = parser.parse_args() + + # Enable logging + logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s") + + # Load model + print(f"Loading model from {args.model}") + model = OVParaformerForSpeechSeq2Seq.from_pretrained(args.model, device=args.device) + print(f"✅ Model loaded on {args.device}") + print(f" Components: {model._component_names}") + print(f" Input names: {list(model.input_names.keys())}") + print(f" Output names: {list(model.output_names.keys())}") + + # Load or create input + if args.input and args.lengths: + speech = torch.from_numpy(np.load(args.input)) + speech_lengths = torch.from_numpy(np.load(args.lengths)) + print(f"Loaded input: speech {speech.shape}, lengths {speech_lengths.shape}") + else: + # Create dummy input + speech = torch.randn(1, 100, 560) + speech_lengths = torch.tensor([100], dtype=torch.int32) + print("Using dummy input: speech [1, 100, 560]") + + # Run inference + print("\nRunning inference...") + output = model(speech, speech_lengths) + + print(f"\n✅ Inference completed!") + print(f" Logits shape: {output.logits.shape}") + print(f" Token numbers: {output.token_num}") + if output.token_ids is not None: + num = int(output.token_num[0]) if output.token_num is not None else 10 + print(f" Token IDs (first {num}): {output.token_ids[0, :num].tolist()}") diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 818eb41726..2b23f78d36 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -204,6 +204,16 @@ "split": "validation", "streaming": True, "revision": "refs/convert/parquet", + }, + "AISHELL-1": { + "id": "speechcolab/aishell", + "split": "validation", + "streaming": True, + }, + "aishell-1": { + "id": "speechcolab/aishell", + "split": "validation", + "streaming": True, } } diff --git a/optimum/intel/pipelines/accelerator_utils.py b/optimum/intel/pipelines/accelerator_utils.py 
index 7ea4102ec7..7ee9d0d8af 100644 --- a/optimum/intel/pipelines/accelerator_utils.py +++ b/optimum/intel/pipelines/accelerator_utils.py @@ -80,6 +80,7 @@ OVModelForVision2Seq, OVModelForVisualCausalLM, OVModelForZeroShotImageClassification, + OVParaformerForSpeechSeq2Seq, ) from ..openvino.modeling_base import OVBaseModel @@ -87,7 +88,7 @@ "audio-classification": (OVModelForAudioClassification,), "audio-frame-classification": (OVModelForAudioFrameClassification,), "audio-xvector": (OVModelForAudioXVector,), - "automatic-speech-recognition": (OVModelForCTC, OVModelForSpeechSeq2Seq), + "automatic-speech-recognition": (OVModelForCTC, OVModelForSpeechSeq2Seq, OVParaformerForSpeechSeq2Seq), "feature-extraction": (OVModelForFeatureExtraction,), "fill-mask": (OVModelForMaskedLM,), "image-classification": (OVModelForImageClassification,), @@ -128,6 +129,12 @@ def get_openvino_model_class( config = AutoConfig.from_pretrained(model_id, **hub_kwargs) if any(arch.endswith("ForCTC") for arch in config.architectures): ov_model_class = OV_TASKS_MAPPING[task][0] + # Check for Paraformer models - detected by model_type or architecture + elif ( + getattr(config, "model_type", "").lower() == "paraformer" + or any("Paraformer" in arch for arch in getattr(config, "architectures", [])) + ): + ov_model_class = OV_TASKS_MAPPING[task][2] # OVParaformerForSpeechSeq2Seq else: ov_model_class = OV_TASKS_MAPPING[task][1] else: diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cab9e5efa3..ba67e4df4a 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -257,6 +257,9 @@ def _infer_library_from_model_name_or_path( ) if "open_clip_config.json" in all_files or "open_clip_pytorch_model.bin" in all_files: library_name = "open_clip" + elif "am.mvn" in all_files and "config.yaml" in all_files and "tokens.json" in all_files: + # Paraformer models have these characteristic files + library_name = "paraformer" else: 
library_name = TasksManager._infer_library_from_model_name_or_path( model_name_or_path=model_name_or_path, cache_dir=cache_dir diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 9519cea1ec..3f2bb9df0c 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -46,6 +46,7 @@ OVModelForSequenceClassification, OVModelForSpeechSeq2Seq, OVModelForTextToSpeechSeq2Seq, + OVParaformerForSpeechSeq2Seq, OVModelForTokenClassification, OVModelForVisualCausalLM, OVModelForZeroShotImageClassification, @@ -79,6 +80,7 @@ class ExportModelTest(unittest.TestCase): "roberta": OVModelForTokenClassification, "wav2vec2": OVModelForAudioClassification, "whisper": OVModelForSpeechSeq2Seq, + "paraformer": OVParaformerForSpeechSeq2Seq, "blenderbot": OVModelForFeatureExtraction, "stable-diffusion": OVStableDiffusionPipeline, "stable-diffusion-xl": OVStableDiffusionXLPipeline, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 8e860ba743..89a3c949b2 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -54,6 +54,7 @@ OVModelForSequenceClassification, OVModelForSpeechSeq2Seq, OVModelForTextToSpeechSeq2Seq, + OVParaformerForSpeechSeq2Seq, OVModelForTokenClassification, OVModelForVisualCausalLM, OVModelForZeroShotImageClassification, @@ -113,6 +114,7 @@ class OVCLIExportTestCase(unittest.TestCase): ("text-to-video", "ltx-video"), ("feature-extraction", "sam"), ("text-to-audio", "speecht5"), + ("automatic-speech-recognition", "paraformer"), ("zero-shot-image-classification", "clip"), ] diff --git a/tests/openvino/test_paraformer.py b/tests/openvino/test_paraformer.py new file mode 100644 index 0000000000..97c00a600e --- /dev/null +++ b/tests/openvino/test_paraformer.py @@ -0,0 +1,315 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+from tempfile import TemporaryDirectory
+
+import numpy as np
+import torch
+from transformers import set_seed
+
+from optimum.intel import OVParaformerForSpeechSeq2Seq
+
+
+# Note: This test requires a Paraformer OpenVINO model to be available.
+# For CI/CD, this should point to a model on Hugging Face Hub once available.
+PARAFORMER_MODEL_PATH = os.environ.get(
+    "PARAFORMER_TEST_MODEL",
+    None  # Set to model path when available on HF Hub
+)
+
+OPENVINO_DEVICE = os.environ.get("OPENVINO_DEVICE", "CPU")
+SEED = 42
+
+
+class OVParaformerForSpeechSeq2SeqTest(unittest.TestCase):
+    """
+    Test suite for OVParaformerForSpeechSeq2Seq model.
+
+    This tests the OpenVINO inference implementation for Paraformer ASR models.
+ """ + + def _generate_random_speech_features(self, batch_size=1, num_frames=100, feature_dim=560): + """Generate random speech features for testing.""" + np.random.seed(SEED) + speech = np.random.randn(batch_size, num_frames, feature_dim).astype(np.float32) + speech_lengths = np.array([num_frames] * batch_size, dtype=np.int32) + return speech, speech_lengths + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_load_model_from_pretrained(self): + """Test loading model from pretrained path.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Check model properties + self.assertIsNotNone(model) + self.assertEqual(model._device, OPENVINO_DEVICE) + self.assertIsNotNone(model.input_names) + self.assertIsNotNone(model.output_names) + self.assertEqual(model.export_feature, "automatic-speech-recognition") + self.assertEqual(model.main_input_name, "speech") + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_model_inference(self): + """Test basic inference functionality.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Generate random input + speech, speech_lengths = self._generate_random_speech_features(batch_size=1, num_frames=100) + speech_tensor = torch.from_numpy(speech) + lengths_tensor = torch.from_numpy(speech_lengths) + + # Run inference + output = model(speech_tensor, lengths_tensor) + + # Check output structure + self.assertIsNotNone(output.logits) + self.assertIsNotNone(output.token_num) + self.assertIsNotNone(output.token_ids) + + # Check shapes + batch_size, seq_len, vocab_size = output.logits.shape + self.assertEqual(batch_size, 1) + self.assertGreater(seq_len, 0) + self.assertGreater(vocab_size, 0) + + # Check token_ids shape matches + self.assertEqual(output.token_ids.shape[0], batch_size) + 
self.assertEqual(output.token_ids.shape[1], seq_len) + + # Check token_num is within bounds + self.assertGreater(output.token_num[0].item(), 0) + self.assertLessEqual(output.token_num[0].item(), seq_len) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_batch_inference(self): + """Test batch inference with variable lengths.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Generate batch with different lengths + batch_size = 3 + max_frames = 120 + speech = np.random.randn(batch_size, max_frames, 560).astype(np.float32) + speech_lengths = np.array([120, 100, 80], dtype=np.int32) + + speech_tensor = torch.from_numpy(speech) + lengths_tensor = torch.from_numpy(speech_lengths) + + # Run batch inference + output = model(speech_tensor, lengths_tensor) + + # Check batch dimension + self.assertEqual(output.logits.shape[0], batch_size) + self.assertEqual(output.token_ids.shape[0], batch_size) + self.assertEqual(len(output.token_num), batch_size) + + # Check all sequences have tokens + for i in range(batch_size): + self.assertGreater(output.token_num[i].item(), 0) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_numpy_input(self): + """Test inference with numpy arrays as input.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Use numpy arrays directly + speech, speech_lengths = self._generate_random_speech_features() + + # Run inference with numpy input + output = model(speech, speech_lengths) + + # Should work the same as torch tensors + self.assertIsNotNone(output.logits) + self.assertIsNotNone(output.token_ids) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_generate_api(self): + """Test the generate() API.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + 
PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + speech, speech_lengths = self._generate_random_speech_features() + speech_tensor = torch.from_numpy(speech) + lengths_tensor = torch.from_numpy(speech_lengths) + + # Use generate() method + token_ids, token_num = model.generate(speech_tensor, lengths_tensor) + + # Check outputs + self.assertIsInstance(token_ids, torch.Tensor) + self.assertIsInstance(token_num, torch.Tensor) + self.assertEqual(token_ids.shape[0], 1) # batch size + self.assertGreater(token_num[0].item(), 0) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_device_switching(self): + """Test switching between CPU and GPU.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device="CPU" + ) + + self.assertEqual(model._device, "CPU") + + speech, speech_lengths = self._generate_random_speech_features() + speech_tensor = torch.from_numpy(speech) + lengths_tensor = torch.from_numpy(speech_lengths) + + # Run on CPU + output_cpu = model(speech_tensor, lengths_tensor) + self.assertIsNotNone(output_cpu.logits) + + # Switch to GPU (if available) + try: + model.to("GPU") + self.assertEqual(model._device, "GPU") + + # Run on GPU + output_gpu = model(speech_tensor, lengths_tensor) + self.assertIsNotNone(output_gpu.logits) + + # Results should be similar (not exactly equal due to precision differences) + self.assertEqual(output_cpu.logits.shape, output_gpu.logits.shape) + except Exception as e: + # GPU might not be available in test environment + self.skipTest(f"GPU not available: {e}") + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_save_and_load(self): + """Test saving and loading model.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + with TemporaryDirectory() as tmp_dir: + # Save model + model.save_pretrained(tmp_dir) + + # Check files were created + 
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "openvino_model.xml"))) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "openvino_model.bin"))) + + # Load saved model + loaded_model = OVParaformerForSpeechSeq2Seq.from_pretrained( + tmp_dir, + device=OPENVINO_DEVICE + ) + + # Test loaded model works + speech, speech_lengths = self._generate_random_speech_features() + output = loaded_model(torch.from_numpy(speech), torch.from_numpy(speech_lengths)) + self.assertIsNotNone(output.logits) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_decode_without_token_num(self): + """Test decode method without token_num (should not mask).""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Create fake logits + batch_size, seq_len, vocab_size = 1, 10, 100 + fake_logits = torch.randn(batch_size, seq_len, vocab_size) + + # Decode without token_num + token_ids = model.decode(fake_logits, token_num=None) + + # Should return argmax of logits + expected = torch.argmax(fake_logits, dim=-1) + self.assertTrue(torch.equal(token_ids, expected)) + + # Decode with token_num (should mask padding) + token_num = torch.tensor([5]) + token_ids_masked = model.decode(fake_logits, token_num=token_num) + + # First 5 should be same, rest should be 0 + self.assertTrue(torch.equal(token_ids_masked[0, :5], expected[0, :5])) + self.assertTrue(torch.all(token_ids_masked[0, 5:] == 0)) + + @unittest.skipIf(PARAFORMER_MODEL_PATH is None, "Paraformer model path not provided") + def test_model_properties(self): + """Test model properties and attributes.""" + model = OVParaformerForSpeechSeq2Seq.from_pretrained( + PARAFORMER_MODEL_PATH, + device=OPENVINO_DEVICE + ) + + # Check component names + self.assertIn("model", model._component_names) + + # Check components dictionary + self.assertIsNotNone(model.components) + self.assertGreater(len(model.components), 0) + + # Check dtype + 
self.assertIsNotNone(model.dtype) + + # Check device property + self.assertEqual(model.device, torch.device("cpu")) + + def test_model_output_dataclass(self): + """Test ParaformerModelOutput dataclass.""" + from optimum.intel.openvino.modeling_speech2text import ParaformerModelOutput + + # Create output with all fields + logits = torch.randn(1, 10, 100) + token_num = torch.tensor([10]) + token_ids = torch.randint(0, 100, (1, 10)) + + output = ParaformerModelOutput( + logits=logits, + token_num=token_num, + token_ids=token_ids + ) + + # Check all fields are accessible + self.assertEqual(output.logits.shape, logits.shape) + self.assertEqual(output.token_num, token_num) + self.assertEqual(output.token_ids.shape, token_ids.shape) + + # Check optional field + output_no_ids = ParaformerModelOutput( + logits=logits, + token_num=token_num + ) + self.assertIsNone(output_no_ids.token_ids) + + +if __name__ == "__main__": + # For local testing with your models + if PARAFORMER_MODEL_PATH is None: + print("=" * 80) + print("WARNING: PARAFORMER_TEST_MODEL environment variable not set") + print("To run tests locally, set:") + print(" export PARAFORMER_TEST_MODEL=/path/to/paraformer-zh/ov_models") + print("=" * 80) + + unittest.main() diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index fe6d584d2f..e685a88c52 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -213,6 +213,7 @@ "wav2vec2-hf": "optimum-intel-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "optimum-intel-internal-testing/tiny-random-wav2vec2-conformer", "whisper": "optimum-intel-internal-testing/tiny-random-whisper", + "paraformer": "funasr/paraformer-zh", "xlm": "optimum-intel-internal-testing/tiny-random-xlm", "xlm-roberta": "optimum-intel-internal-testing/tiny-xlm-roberta", "xglm": "optimum-intel-internal-testing/tiny-random-XGLMForCausalLM", @@ -244,6 +245,7 @@ "granitemoehybrid": {"model": 118}, "wav2vec2": {"model": 34}, "distilbert": 
{"model": 66}, + "paraformer": {"model": 268}, "t5": { "encoder": 64, "decoder": 104,