diff --git a/benchmark/benchmark_ppdet.py b/benchmark/benchmark_ppdet.py index 8f7033db4f7..6544c7e6011 100755 --- a/benchmark/benchmark_ppdet.py +++ b/benchmark/benchmark_ppdet.py @@ -17,7 +17,7 @@ import os import numpy as np import time - +from tqdm import tqdm def parse_arguments(): import argparse @@ -263,6 +263,9 @@ def cpu_stat_func(self, q, pid, interval=0.0): elif "yolov3" in args.model: model = fd.vision.detection.YOLOv3( model_file, params_file, config_file, runtime_option=option) + elif "yolov8" in args.model: + model = fd.vision.detection.PaddleYOLOv8( + model_file, params_file, config_file, runtime_option=option) elif "ppyolo_r50vd_dcn_1x_coco" in args.model or "ppyolov2_r101vd_dcn_365e_coco" in args.model: model = fd.vision.detection.PPYOLO( model_file, params_file, config_file, runtime_option=option) @@ -284,7 +287,7 @@ def cpu_stat_func(self, q, pid, interval=0.0): model.enable_record_time_of_runtime() im_ori = cv2.imread(args.image) - for i in range(args.iter_num): + for i in tqdm(range(args.iter_num)): im = im_ori start = time.time() result = model.predict(im) diff --git a/docs/cn/faq/rknpu2/rknpu2.md b/docs/cn/faq/rknpu2/rknpu2.md index fcd3e7c35c2..81f35bd431d 100644 --- a/docs/cn/faq/rknpu2/rknpu2.md +++ b/docs/cn/faq/rknpu2/rknpu2.md @@ -13,14 +13,22 @@ ONNX模型不能直接调用RK芯片中的NPU进行运算,需要把ONNX模型 * ARM CPU使用ONNX框架进行测试 * NPU均使用单核进行测试 -| 任务场景 | 模型 | 模型版本(表示已经测试的版本) | ARM CPU/RKNN速度(ms) | -|----------------|------------------------------------------------------------------------------------------|--------------------------|--------------------| -| Detection | [Picodet](../../../../examples/vision/detection/paddledetection/rknpu2/README.md) | Picodet-s | 162/112 | -| Detection | [RKYOLOV5](../../../../examples/vision/detection/rkyolo/README.md) | YOLOV5-S-Relu(int8) | -/57 | -| Detection | [RKYOLOX](../../../../examples/vision/detection/rkyolo/README.md) | - | -/- | -| Detection | [RKYOLOV7](../../../../examples/vision/detection/rkyolo/README.md) | - | -/- | -| Segmentation | [Unet](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | Unet-cityscapes | -/- | -| Segmentation | [PP-HumanSegV2Lite](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | portrait(int8) | 133/43 | -| Segmentation | [PP-HumanSegV2Lite](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | human(int8) | 133/43 | -| Face Detection | [SCRFD](../../../../examples/vision/facedet/scrfd/rknpu2/README.md) | SCRFD-2.5G-kps-640(int8) | 108/42 | -| Classification | [ResNet](../../../../examples/vision/classification/paddleclas/rknpu2/README.md) | ResNet50_vd | -/33 | +| 任务场景 | 模型 | 模型版本(表示已经测试的版本) | ARM CPU/RKNN速度(ms) | +|----------------------|------------------------------------------------------------------------------------------|--------------------------|--------------------| +| Detection | [Picodet](../../../../examples/vision/detection/paddledetection/rknpu2/README.md) | Picodet-s | 162/112 | +| Detection | [RKYOLOV5](../../../../examples/vision/detection/rkyolo/README.md) | YOLOV5-S-Relu(int8) | -/57 | +| Detection | [RKYOLOX](../../../../examples/vision/detection/rkyolo/README.md) | - | -/- | +| Detection | [RKYOLOV7](../../../../examples/vision/detection/rkyolo/README.md) | - | -/- | +| Segmentation | [Unet](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | Unet-cityscapes | -/- | +| Segmentation | [PP-HumanSegV2Lite](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | portrait(int8) | 133/43 | +| Segmentation | 
[PP-HumanSegV2Lite](../../../../examples/vision/segmentation/paddleseg/rknpu2/README.md) | human(int8) | 133/43 | +| Face Detection | [SCRFD](../../../../examples/vision/facedet/scrfd/rknpu2/README.md) | SCRFD-2.5G-kps-640(int8) | 108/42 | +| Face FaceRecognition | [InsightFace](../../../../examples/vision/faceid/insightface/rknpu2/README_CN.md) | ms1mv3_arcface_r18(int8) | 81/12 | +| Classification | [ResNet](../../../../examples/vision/classification/paddleclas/rknpu2/README.md) | ResNet50_vd | -/33 | + +## 预编译库下载 + +为了方便大家进行开发,这里提供1.0.2版本的FastDeploy给大家使用 + +- [FastDeploy RK356X c++ SDK](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-rk356X-1.0.2.tgz) +- [FastDeploy RK3588 c++ SDK](https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-aarch64-rk3588-1.0.2.tgz) diff --git a/examples/audio/silero-vad/README.md b/examples/audio/silero-vad/README.md new file mode 100644 index 00000000000..8b7e21f4ef3 --- /dev/null +++ b/examples/audio/silero-vad/README.md @@ -0,0 +1,41 @@ +English | [简体中文](README_CN.md) + +# Silero VAD - pre-trained enterprise-grade Voice Activity Detector + +The deployment model comes from [silero-vad](https://github.com/snakers4/silero-vad) + +![](https://user-images.githubusercontent.com/36505480/198026365-8da383e0-5398-4a12-b7f8-22c2c0059512.png) + +## Key Features + +* Stellar accuracy + +Silero VAD has excellent results on speech detection tasks. + +* Fast + +One audio chunk (30+ ms) takes less than 1ms to be processed on a single CPU thread. Using batching or GPU can also improve performance considerably. + +* General + +Silero VAD was trained on huge corpora that include over 100 languages and it performs well on audios from different domains with various background noise and quality levels. + +* Flexible sampling rate + +Silero VAD supports 8000 Hz and 16000 Hz sampling rates. + +## Download Pre-trained ONNX Model + +For developers' testing, model exported by VAD are provided below. Developers can download them directly. 
+ +| 模型 | 大小 | 备注 | +| :----------------------------------------------------------- | :---- | :----------------------------------------------------------- | +| [silero-vad](https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz) | 1.8MB | This model file is sourced from [snakers4/silero-vad](https://github.com/snakers4/silero-vad),MIT License | + +## Detailed Deployment Documents + +- [C++ deployment](cpp) + +## Source + +[https://github.com/snakers4/silero-vad](https://github.com/snakers4/silero-vad) diff --git a/examples/audio/silero-vad/README_CN.md b/examples/audio/silero-vad/README_CN.md new file mode 100644 index 00000000000..81825cf647c --- /dev/null +++ b/examples/audio/silero-vad/README_CN.md @@ -0,0 +1,40 @@ +简体中文 | [English](README.md) + +# Silero VAD 预训练的企业级语音活动检测器 + +该部署模型来自于 [silero-vad](https://github.com/snakers4/silero-vad) + +![](https://user-images.githubusercontent.com/36505480/198026365-8da383e0-5398-4a12-b7f8-22c2c0059512.png) + +## 主要特征 + +* 高准确率 + +Silero VAD在语音检测任务上有着优异的成绩。 + +* 快速推理 + +一个音频块(30+ 毫秒)在单个 CPU 线程上处理时间不到 1毫秒。 + +* 通用性 + +Silero VAD 在包含100多种语言的庞大语料库上进行了训练,它在来自不同领域、具有不同背景噪音和质量水平的音频上表现良好。 + +* 灵活采样率 + +Silero VAD支持 8000 Hz和16000 Hz 采样率。 + +## 下载预训练ONNX模型 + +为了方便开发者的测试,下面提供了 VAD 导出模型,开发者可直接下载使用。 +| 模型 | 大小 | 备注 | +| :----------------------------------------------------------- | :---- | :----------------------------------------------------------- | +| [silero-vad](https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz) | 1.8MB | 此模型文件来源于[snakers4/silero-vad](https://github.com/snakers4/silero-vad),MIT License | + +## 详细部署文档 + +- [C++ 部署](cpp) + +## 模型来源 + +[https://github.com/snakers4/silero-vad](https://github.com/snakers4/silero-vad) diff --git a/examples/audio/silero-vad/cpp/CMakeLists.txt b/examples/audio/silero-vad/cpp/CMakeLists.txt new file mode 100644 index 00000000000..004d1931681 --- /dev/null +++ b/examples/audio/silero-vad/cpp/CMakeLists.txt @@ -0,0 +1,17 @@ +cmake_minimum_required(VERSION 3.23) +project(silero_vad) + +set(CMAKE_CXX_STANDARD 11) + +# 指定下载解压后的fastdeploy库路径 +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +# 添加FastDeploy依赖头文件 +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_onnx_silero_vad ${PROJECT_SOURCE_DIR}/infer_onnx_silero_vad.cc wav.h vad.cc vad.h) + +# 添加FastDeploy库依赖 +target_link_libraries(infer_onnx_silero_vad ${FASTDEPLOY_LIBS}) diff --git a/examples/audio/silero-vad/cpp/README.md b/examples/audio/silero-vad/cpp/README.md new file mode 100644 index 00000000000..f032be86230 --- /dev/null +++ b/examples/audio/silero-vad/cpp/README.md @@ -0,0 +1,121 @@ +English | [简体中文](README_CN.md) + +# Silero VAD Deployment Example + +This directory provides examples that `infer_onnx_silero_vad` fast finishes the deployment of VAD models on CPU/GPU. + +Before deployment, two steps require confirmation. + +- 1. Software and hardware should meet the requirements. Please refer to [FastDeploy Environment Requirements](../../../../docs/en/build_and_install/download_prebuilt_libraries.md). +- 2. Download the precompiled deployment library and samples code according to your development environment. Refer to [FastDeploy Precompiled Library](../../../../docs/en/build_and_install/download_prebuilt_libraries.md). 
+ +Taking VAD inference on Linux as an example, the compilation test can be completed by executing the following command in this directory. + +```bash +mkdir build +cd build +# Download the FastDeploy precompiled library. Users can choose your appropriate version in the `FastDeploy Precompiled Library` mentioned above +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz +tar xvf fastdeploy-linux-x64-x.x.x.tgz +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x +make -j + +# Download the VAD model file and test audio. After decompression, place the model and test audio in the infer_onnx_silero_vad.cc peer directory +wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz +wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav + +# inference +./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav +``` + +- The above command works for Linux or MacOS. Refer to: + - [How to use FastDeploy C++ SDK in Windows](../../../../docs/en/faq/use_sdk_on_windows.md) for SDK use-pattern in Windows + +## VAD C++ Interface + +### Vad Class + +```c++ +Vad::Vad(const std::string& model_file, + const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption()) +``` + +**Parameter** + +> * **model_file**(str): Model file path +> * **runtime_option**(RuntimeOption): Backend inference configuration. None by default. (use the default configuration) + +### setAudioCofig function + +**Must be called before the `init` function** + +```c++ +void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms); +``` + +**Parameter** + +> * **sr**(int): sampling rate +> * **frame_ms**(int): The length of each detection frame, and it is used to calculate the detection window size +> * **threshold**(float): Result probability judgment threshold +> * **min_silence_duration_ms**(int): The threshold used to calculate whether it is silence +> * **speech_pad_ms**(int): Used to calculate the end time of the speech + +### init function + +Used to initialize audio-related parameters. + +```c++ +void Vad::init(); +``` + +### loadAudio function + +Load audio. + +```c++ +void Vad::loadAudio(const std::string& wavPath) +``` + +**Parameter** + +> * **wavPath**(str): Audio file path + +### Predict function + +Used to start model reasoning. + +```c++ +bool Vad::Predict(); +``` + +### getResult function + +**Used to obtain reasoning results** + +```c++ +std::vector> Vad::getResult( + float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0, + float mergeThreshold = 0.3); +``` + +**Parameter** + +> * **removeThreshold**(float): Discard result fragment threshold; If some recognition results are too short, they will be discarded according to this threshold +> * **expandHeadThreshold**(float): Offset at the beginning of the segment; The recognized start time may be too close to the voice part, so move forward the start time accordingly +> * **expandTailThreshold**(float): Offset at the end of the segment; The recognized end time may be too close to the voice part, so the end time is moved back accordingly +> * **mergeThreshold**(float): Some result segments are very close and can be combined into one, and the vocal segments can be combined accordingly + +**The output result format is**`std::vector>` + +> Output a list, each element is a speech fragment +> +> Each clip can use 'start' to get the start time and 'end' to get the end time + +### Tips + +1. 
`The setAudioCofig`function must be called before the `init` function +2. The sampling rate of the input audio file must be consistent with that set in the code + +- [Model Description](../) +- [How to switch the model inference backend engine](../../../../docs/en/faq/how_to_change_backend.md) diff --git a/examples/audio/silero-vad/cpp/README_CN.md b/examples/audio/silero-vad/cpp/README_CN.md new file mode 100644 index 00000000000..c45d9896c38 --- /dev/null +++ b/examples/audio/silero-vad/cpp/README_CN.md @@ -0,0 +1,119 @@ +[English](README.md) | 简体中文 +# Silero VAD 部署示例 + +本目录下提供`infer_onnx_silero_vad`快速完成 Silero VAD 模型在CPU/GPU。 + +在部署前,需确认以下两个步骤 + +- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) +- 2. 根据开发环境,下载预编译部署库和samples代码,参考[FastDeploy预编译库](../../../../docs/cn/build_and_install/download_prebuilt_libraries.md) + +以Linux上 VAD 推理为例,在本目录执行如下命令即可完成编译测试。 + +```bash +mkdir build +cd build +# 下载FastDeploy预编译库,用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用 +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz +tar xvf fastdeploy-linux-x64-x.x.x.tgz +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x +make -j + +# 下载 VAD 模型文件和测试音频,解压后将模型和测试音频放置在与 infer_onnx_silero_vad.cc 同级目录下 +wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad.tgz +wget https://bj.bcebos.com/paddlehub/fastdeploy/silero_vad_sample.wav + +# 推理 +./infer_onnx_silero_vad ../silero_vad.onnx ../silero_vad_sample.wav +``` + +以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考: +- [如何在Windows中使用FastDeploy C++ SDK](../../../../docs/cn/faq/use_sdk_on_windows.md) + +## VAD C++ 接口 +### Vad 类 + +```c++ +Vad::Vad(const std::string& model_file, + const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption()) +``` + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 + +### setAudioCofig 函数 + +**必须在`init`函数前调用** + +```c++ +void Vad::setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms); +``` + +**参数** + +> * **sr**(int): 采样率 +> * **frame_ms**(int): 每次检测帧长,用于计算检测窗口大小 +> * **threshold**(float): 结果概率判断阈值 +> * **min_silence_duration_ms**(int): 用于计算判断是否是 silence 的阈值 +> * **speech_pad_ms**(int): 用于计算 speach 结束时刻 + +### init 函数 + +用于初始化音频相关参数 + +```c++ +void Vad::init(); +``` + +### loadAudio 函数 + +加载音频 + +```c++ +void Vad::loadAudio(const std::string& wavPath) +``` + +**参数** + +> * **wavPath**(str): 音频文件路径 + +### Predict 函数 + +用于开始模型推理 + +```c++ +bool Vad::Predict(); +``` + +### getResult 函数 + +**用于获取推理结果** + +```c++ +std::vector> Vad::getResult( + float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0, + float mergeThreshold = 0.3); +``` + +**参数** + +> * **removeThreshold**(float): 丢弃结果片段阈值;部分识别结果太短则根据此阈值丢弃 +> * **expandHeadThreshold**(float): 结果片段开始时刻偏移;识别到的开始时刻可能过于贴近发声部分,因此据此前移开始时刻 +> * **expandTailThreshold**(float): 结果片段结束时刻偏移;识别到的结束时刻可能过于贴近发声部分,因此据此后移结束时刻 +> * **mergeThreshold**(float): 有的结果片段十分靠近,可以合并成一个,据此合并发声片段 + +**输出结果格式为**`std::vector>` + +> 输出一个列表,每个元素是一个讲话片段 +> +> 每个片段可以用 'start' 获取到开始时刻,用 'end' 获取到结束时刻 + +### 提示 + +1. `setAudioCofig`函数必须在`init`函数前调用 +2. 
输入的音频文件的采样率必须与代码中设置的保持一致 + +- [模型介绍](../) +- [如何切换模型推理后端引擎](../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/audio/silero-vad/cpp/infer_onnx_silero_vad.cc b/examples/audio/silero-vad/cpp/infer_onnx_silero_vad.cc new file mode 100644 index 00000000000..a23898550c3 --- /dev/null +++ b/examples/audio/silero-vad/cpp/infer_onnx_silero_vad.cc @@ -0,0 +1,29 @@ +#include + +#include "vad.h" + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cout << "Usage: infer_onnx_silero_vad path/to/model path/to/audio " + "run_option, " + "e.g ./infer_onnx_silero_vad silero_vad.onnx sample.wav" + << std::endl; + return -1; + } + + std::string model_file = argv[1]; + std::string audio_file = argv[2]; + + Vad vad(model_file); + // custom config, but must be set before init + // vad.setAudioCofig(16000, 64, 0.5f, 0, 0); + vad.init(); + vad.loadAudio(audio_file); + vad.Predict(); + std::vector> result = vad.getResult(); + for (auto& res : result) { + std::cout << "speak start: " << res["start"] << " s, end: " << res["end"] + << " s" << std::endl; + } + return 0; +} diff --git a/examples/audio/silero-vad/cpp/vad.cc b/examples/audio/silero-vad/cpp/vad.cc new file mode 100644 index 00000000000..5b451605905 --- /dev/null +++ b/examples/audio/silero-vad/cpp/vad.cc @@ -0,0 +1,258 @@ +// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
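+
+// vad.cc implements a small windowed VAD pipeline: the input wav is loaded
+// with wav::WavReader and normalized to floats in [-1, 1], the samples are
+// split into fixed-size windows, and each window is run through the
+// silero-vad ONNX model via the FastDeploy runtime. Per-window speech
+// probabilities are turned into start/end timestamps in Postprocess() and
+// cleaned up (short-segment removal, padding, merging) in getResult().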
+#include "vad.h" + +int Vad::getSampleRate() const { return sample_rate_; } + +int Vad::getFrameMs() const { return frame_ms_; } + +float Vad::getThreshold() const { return threshold_; } + +int Vad::getMinSilenceDurationMs() const { return min_silence_duration_ms_; } + +int Vad::getSpeechPadMs() const { return speech_pad_ms_; } + +const wav::WavReader &Vad::getWavReader() const { return wavReader_; } + +const std::vector &Vad::getData() const { return data_; } + +const std::vector &Vad::getInputWav() const { return inputWav_; } + +int64_t Vad::getWindowSizeSamples() const { return window_size_samples_; } + +int Vad::getSrPerMs() const { return sr_per_ms_; } + +int Vad::getMinSilenceSamples() const { return min_silence_samples_; } + +int Vad::getSpeechPadSamples() const { return speech_pad_samples_; } + +std::string Vad::ModelName() const { return "VAD"; } + +void Vad::loadAudio(const std::string &wavPath) { + wavReader_ = wav::WavReader(wavPath); + data_.reserve(wavReader_.num_samples()); + inputWav_.reserve(wavReader_.num_samples()); + + for (int i = 0; i < wavReader_.num_samples(); i++) { + data_[i] = static_cast(*(wavReader_.data() + i)); + } + + for (int i = 0; i < wavReader_.num_samples(); i++) { + inputWav_[i] = static_cast(data_[i]) / 32768; + } +} + +bool Vad::Initialize() { + // initAudioConfig + sr_per_ms_ = sample_rate_ / 1000; + min_silence_samples_ = sr_per_ms_ * min_silence_duration_ms_; + speech_pad_samples_ = sr_per_ms_ * speech_pad_ms_; + window_size_samples_ = frame_ms_ * sr_per_ms_; + + // initInputConfig + input_.resize(window_size_samples_); + input_node_dims_.emplace_back(1); + input_node_dims_.emplace_back(window_size_samples_); + + _h.resize(size_hc_); + _c.resize(size_hc_); + sr_.resize(1); + sr_[0] = sample_rate_; + + // InitRuntime + if (!InitRuntime()) { + fastdeploy::FDERROR << "Failed to initialize fastdeploy backend." 
+ << std::endl; + return false; + } + return true; +} + +void Vad::setAudioCofig(int sr, int frame_ms, float threshold, + int min_silence_duration_ms, int speech_pad_ms) { + if (initialized) { + fastdeploy::FDERROR << "setAudioCofig must be called before init" + << std::endl; + throw std::runtime_error("setAudioCofig must be called before init"); + } + sample_rate_ = sr; + Vad::frame_ms_ = frame_ms; + Vad::threshold_ = threshold; + Vad::min_silence_duration_ms_ = min_silence_duration_ms; + Vad::speech_pad_ms_ = speech_pad_ms; +} + +bool Vad::Preprocess(std::vector audioWindowData) { + fastdeploy::FDTensor inputTensor, srTensor, hTensor, cTensor; + inputTensor.SetExternalData(input_node_dims_, fastdeploy::FDDataType::FP32, + audioWindowData.data()); + inputTensor.name = "input"; + srTensor.SetExternalData(sr_node_dims_, fastdeploy::FDDataType::INT64, + sr_.data()); + srTensor.name = "sr"; + hTensor.SetExternalData(hc_node_dims_, fastdeploy::FDDataType::FP32, + _h.data()); + hTensor.name = "h"; + cTensor.SetExternalData(hc_node_dims_, fastdeploy::FDDataType::FP32, + _c.data()); + cTensor.name = "c"; + + inputTensors_.clear(); + inputTensors_.emplace_back(inputTensor); + inputTensors_.emplace_back(srTensor); + inputTensors_.emplace_back(hTensor); + inputTensors_.emplace_back(cTensor); + return true; +} + +bool Vad::Predict() { + if (wavReader_.sample_rate() != sample_rate_) { + fastdeploy::FDINFO << "The sampling rate of the audio file is " << wavReader_.sample_rate() << std::endl; + fastdeploy::FDINFO << "The set sample rate is " << sample_rate_ << std::endl; + fastdeploy::FDERROR << "The sampling rate of the audio file is not equal " + "to the sampling rate set by the program. " + << "Please make it equal. " + << "You can modify the audio file sampling rate, " + << "or use setAudioCofig to modify the program's " + "sampling rate and other configurations." + << std::endl; + throw std::runtime_error( + "The sampling rate of the audio file is not equal to the sampling rate " + "set by the program."); + } + for (int64_t j = 0; j < wavReader_.num_samples(); j += window_size_samples_) { + std::vector r{&inputWav_[0] + j, + &inputWav_[0] + j + window_size_samples_}; + Preprocess(r); + if (!Infer(inputTensors_, &outputTensors_)) { + fastdeploy::FDERROR << "Failed to inference while using model:" + << ModelName() << "." << std::endl; + return false; + } + Postprocess(); + } + return true; +} + +bool Vad::Postprocess() { + // update prob, h, c + outputProb_ = *(float *)outputTensors_[0].Data(); + auto *hn = static_cast(outputTensors_[1].MutableData()); + std::memcpy(_h.data(), hn, size_hc_ * sizeof(float)); + auto *cn = static_cast(outputTensors_[2].MutableData()); + std::memcpy(_c.data(), cn, size_hc_ * sizeof(float)); + + // Push forward sample index + current_sample_ += window_size_samples_; + + if (outputProb_ >= threshold_ && temp_end_) { + // Reset temp_end_ when > threshold_ + temp_end_ = 0; + } + if (outputProb_ < threshold_ && !triggerd_) { + // 1) Silence + // printf("{ silence: %.3f s }\n", 1.0 * current_sample_ / sample_rate_); + } + if (outputProb_ >= threshold_ - 0.15 && triggerd_) { + // 2) Speaking + // printf("{ speaking_2: %.3f s }\n", 1.0 * current_sample_ / sample_rate_); + } + if (outputProb_ >= threshold_ && !triggerd_) { + // 3) Start + triggerd_ = true; + speech_start_ = current_sample_ - window_size_samples_ - + speech_pad_samples_; // minus window_size_samples_ to get + // precise start time point. 
+ // printf("{ start: %.5f s }\n", 1.0 * speech_start_ / sample_rate_); + speakStart_.emplace_back(1.0 * speech_start_ / sample_rate_); + } + if (outputProb_ < threshold_ - 0.15 && triggerd_) { + // 4) End + if (temp_end_ != 0) { + temp_end_ = current_sample_; + } + if (current_sample_ - temp_end_ < min_silence_samples_) { + // a. silence < min_slience_samples, continue speaking + // printf("{ speaking_4: %.3f s }\n", 1.0 * current_sample_ / sample_rate_); + // printf(""); + } else { + // b. silence >= min_slience_samples, end speaking + speech_end_ = current_sample_ + speech_pad_samples_; + temp_end_ = 0; + triggerd_ = false; + // printf("{ end: %.5f s }\n", 1.0 * speech_end_ / sample_rate_); + speakEnd_.emplace_back(1.0 * speech_end_ / sample_rate_); + } + } + + return true; +} + +std::vector> Vad::getResult( + float removeThreshold, float expandHeadThreshold, float expandTailThreshold, + float mergeThreshold) { + float audioLength = 1.0 * wavReader_.num_samples() / sample_rate_; + if (speakStart_.empty() && speakEnd_.empty()) { + return {}; + } + if (speakEnd_.size() != speakStart_.size()) { + // set the audio length as the last end + speakEnd_.emplace_back(audioLength); + } + // Remove too short segments + auto startIter = speakStart_.begin(); + auto endIter = speakEnd_.begin(); + while (startIter != speakStart_.end()) { + if (removeThreshold < audioLength && + *endIter - *startIter < removeThreshold) { + startIter = speakStart_.erase(startIter); + endIter = speakEnd_.erase(endIter); + } else { + startIter++; + endIter++; + } + } + // Expand to avoid to tight cut. + startIter = speakStart_.begin(); + endIter = speakEnd_.begin(); + *startIter = std::fmax(0.f, *startIter - expandHeadThreshold); + *endIter = std::fmin(*endIter + expandTailThreshold, *(startIter + 1)); + endIter = speakEnd_.end() - 1; + startIter = speakStart_.end() - 1; + *startIter = fmax(*startIter - expandHeadThreshold, *(endIter - 1)); + *endIter = std::fmin(*endIter + expandTailThreshold, audioLength); + for (int i = 1; i < speakStart_.size() - 1; ++i) { + speakStart_[i] = std::fmax(speakStart_[i] - expandHeadThreshold, speakEnd_[i - 1]); + speakEnd_[i] = std::fmin(speakEnd_[i] + expandTailThreshold, speakStart_[i + 1]); + } + // Merge very closed segments + startIter = speakStart_.begin() + 1; + endIter = speakEnd_.begin(); + while (startIter != speakStart_.end()) { + if (*startIter - *endIter < mergeThreshold) { + startIter = speakStart_.erase(startIter); + endIter = speakEnd_.erase(endIter); + } else { + startIter++; + endIter++; + } + } + + std::vector> result; + for (int i = 0; i < speakStart_.size(); ++i) { + result.emplace_back(std::map( + {{"start", speakStart_[i]}, {"end", speakEnd_[i]}})); + } + return result; +} diff --git a/examples/audio/silero-vad/cpp/vad.h b/examples/audio/silero-vad/cpp/vad.h new file mode 100644 index 00000000000..322e98a4812 --- /dev/null +++ b/examples/audio/silero-vad/cpp/vad.h @@ -0,0 +1,141 @@ +// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include "fastdeploy/runtime.h" +#include "fastdeploy/fastdeploy_model.h" +#include "fastdeploy/vision/utils/utils.h" +#include "wav.h" + +class Vad:public fastdeploy::FastDeployModel{ + public: + std::string ModelName() const override; + + Vad(const std::string& model_file, + const fastdeploy::RuntimeOption& custom_option = + fastdeploy::RuntimeOption()) { + valid_cpu_backends = {fastdeploy::Backend::ORT, + fastdeploy::Backend::OPENVINO}; + valid_gpu_backends = {fastdeploy::Backend::ORT, + fastdeploy::Backend::TRT}; + + runtime_option = custom_option; + runtime_option.model_format = fastdeploy::ModelFormat::ONNX; + runtime_option.model_file = model_file; + runtime_option.params_file = ""; + } + + void init() { + initialized = Initialize(); + } + + void setAudioCofig( + int sr, int frame_ms, float threshold, + int min_silence_duration_ms, int speech_pad_ms); + + void loadAudio(const std::string& wavPath); + + bool Predict(); + + std::vector> getResult( + float removeThreshold = 1.6, + float expandHeadThreshold = 0.32, float expandTailThreshold = 0, + float mergeThreshold = 0.3); + + private: + bool Initialize(); + + bool Preprocess(std::vector audioWindowData); + + bool Postprocess(); + + private: + // model + std::vector inputTensors_; + std::vector outputTensors_; + // model states + bool triggerd_ = false; + unsigned int speech_start_ = 0; + unsigned int speech_end_ = 0; + unsigned int temp_end_ = 0; + unsigned int current_sample_ = 0; + // MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes + float outputProb_; + + /* ======================================================================== */ + + // input wav data + wav::WavReader wavReader_; + std::vector data_; + std::vector inputWav_; + + /* ======================================================================== */ + + // audio config + int sample_rate_ = 16000; + int frame_ms_ = 64; + float threshold_ = 0.5f; + int min_silence_duration_ms_ = 0; + int speech_pad_ms_ = 0; + + int64_t window_size_samples_; + // Assign when init, support 256 512 768 for 8k; 512 1024 1536 for 16k. + int sr_per_ms_; // Assign when init, support 8 or 16 + int min_silence_samples_; // sr_per_ms_ * #ms + int speech_pad_samples_; // usually a + + /* ======================================================================== */ + + std::vector input_; + std::vector sr_; + unsigned int size_hc_ = 2 * 1 * 64; // It's FIXED. 
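+  // Recurrent state tensors ("h" and "c") passed to the model with every
+  // window and refreshed from the model outputs in Postprocess().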
+ std::vector _h; + std::vector _c; + + std::vector input_node_dims_; + const std::vector sr_node_dims_ = {1}; + const std::vector hc_node_dims_ = {2, 1, 64}; + + /* ======================================================================== */ + + std::vector speakStart_; + std::vector speakEnd_; + + public: + int getSampleRate() const; + + int getFrameMs() const; + + float getThreshold() const; + + int getMinSilenceDurationMs() const; + + int getSpeechPadMs() const; + + const wav::WavReader &getWavReader() const; + + const std::vector &getData() const; + + const std::vector &getInputWav() const; + + int64_t getWindowSizeSamples() const; + + int getSrPerMs() const; + + int getMinSilenceSamples() const; + + int getSpeechPadSamples() const; +}; diff --git a/examples/audio/silero-vad/cpp/wav.h b/examples/audio/silero-vad/cpp/wav.h new file mode 100644 index 00000000000..adb81761c94 --- /dev/null +++ b/examples/audio/silero-vad/cpp/wav.h @@ -0,0 +1,193 @@ +// Copyright (c) 2016 Personal (Binbin Zhang) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace wav { + +struct WavHeader { + char riff[4]; // "riff" + unsigned int size; + char wav[4]; // "WAVE" + char fmt[4]; // "fmt " + unsigned int fmt_size; + uint16_t format; + uint16_t channels; + unsigned int sample_rate; + unsigned int bytes_per_second; + uint16_t block_size; + uint16_t bit; + char data[4]; // "data" + unsigned int data_size; +}; + +class WavReader { + public: + WavReader() : data_(nullptr) {} + explicit WavReader(const std::string& filename) { Open(filename); } + + bool Open(const std::string& filename) { + FILE* fp = fopen(filename.c_str(), "rb"); + if (NULL == fp) { + std::cout << "Error in read " << filename; + return false; + } + + WavHeader header; + fread(&header, 1, sizeof(header), fp); + if (header.fmt_size < 16) { + fprintf(stderr, + "WaveData: expect PCM format data " + "to have fmt chunk of at least size 16.\n"); + return false; + } else if (header.fmt_size > 16) { + int offset = 44 - 8 + header.fmt_size - 16; + fseek(fp, offset, SEEK_SET); + fread(header.data, 8, sizeof(char), fp); + } + // check "riff" "WAVE" "fmt " "data" + + // Skip any sub-chunks between "fmt" and "data". Usually there will + // be a single "fact" sub chunk, but on Windows there can also be a + // "list" sub chunk. + while (0 != strncmp(header.data, "data", 4)) { + // We will just ignore the data in these chunks. 
+ fseek(fp, header.data_size, SEEK_CUR); + // read next sub chunk + fread(header.data, 8, sizeof(char), fp); + } + + num_channel_ = header.channels; + sample_rate_ = header.sample_rate; + bits_per_sample_ = header.bit; + int num_data = header.data_size / (bits_per_sample_ / 8); + data_ = new float[num_data]; // Create 1-dim array + num_samples_ = num_data / num_channel_; + + for (int i = 0; i < num_data; ++i) { + switch (bits_per_sample_) { + case 8: { + char sample; + fread(&sample, 1, sizeof(char), fp); + data_[i] = static_cast(sample); + break; + } + case 16: { + int16_t sample; + fread(&sample, 1, sizeof(int16_t), fp); + // std::cout << sample; + data_[i] = static_cast(sample); + // std::cout << data_[i]; + break; + } + case 32: { + int sample; + fread(&sample, 1, sizeof(int), fp); + data_[i] = static_cast(sample); + break; + } + default: + fprintf(stderr, "unsupported quantization bits"); + exit(1); + } + } + fclose(fp); + return true; + } + + int num_channel() const { return num_channel_; } + int sample_rate() const { return sample_rate_; } + int bits_per_sample() const { return bits_per_sample_; } + int num_samples() const { return num_samples_; } + const float* data() const { return data_; } + + private: + int num_channel_; + int sample_rate_; + int bits_per_sample_; + int num_samples_; // sample points per channel + float* data_; +}; + +class WavWriter { + public: + WavWriter(const float* data, int num_samples, int num_channel, + int sample_rate, int bits_per_sample) + : data_(data), + num_samples_(num_samples), + num_channel_(num_channel), + sample_rate_(sample_rate), + bits_per_sample_(bits_per_sample) {} + + void Write(const std::string& filename) { + FILE* fp = fopen(filename.c_str(), "w"); + // init char 'riff' 'WAVE' 'fmt ' 'data' + WavHeader header; + char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57, + 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00}; + memcpy(&header, wav_header, sizeof(header)); + header.channels = num_channel_; + header.bit = bits_per_sample_; + header.sample_rate = sample_rate_; + header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8); + header.size = sizeof(header) - 8 + header.data_size; + header.bytes_per_second = + sample_rate_ * num_channel_ * (bits_per_sample_ / 8); + header.block_size = num_channel_ * (bits_per_sample_ / 8); + + fwrite(&header, 1, sizeof(header), fp); + + for (int i = 0; i < num_samples_; ++i) { + for (int j = 0; j < num_channel_; ++j) { + switch (bits_per_sample_) { + case 8: { + char sample = static_cast(data_[i * num_channel_ + j]); + fwrite(&sample, 1, sizeof(sample), fp); + break; + } + case 16: { + int16_t sample = static_cast(data_[i * num_channel_ + j]); + fwrite(&sample, 1, sizeof(sample), fp); + break; + } + case 32: { + int sample = static_cast(data_[i * num_channel_ + j]); + fwrite(&sample, 1, sizeof(sample), fp); + break; + } + } + } + } + fclose(fp); + } + + private: + const float* data_; + int num_samples_; // total float points in data_ + int num_channel_; + int sample_rate_; + int bits_per_sample_; +}; + +} // namespace wav diff --git a/examples/vision/facedet/scrfd/cpp/CMakeLists.txt b/examples/vision/facedet/scrfd/cpp/CMakeLists.txt index 93540a7e83e..41516387891 100644 --- a/examples/vision/facedet/scrfd/cpp/CMakeLists.txt +++ b/examples/vision/facedet/scrfd/cpp/CMakeLists.txt @@ -1,14 +1,15 @@ 
PROJECT(infer_demo C CXX) CMAKE_MINIMUM_REQUIRED (VERSION 3.10) -# 指定下载解压后的fastdeploy库路径 option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) -# 添加FastDeploy依赖头文件 + include_directories(${FASTDEPLOY_INCS}) -add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc) -# 添加FastDeploy库依赖 -target_link_libraries(infer_demo ${FASTDEPLOY_LIBS}) +add_executable(infer_with_face_align_demo ${PROJECT_SOURCE_DIR}/infer_with_face_align.cc) +target_link_libraries(infer_with_face_align_demo ${FASTDEPLOY_LIBS}) + +add_executable(infer_without_face_align_demo ${PROJECT_SOURCE_DIR}/infer_without_face_align.cc) +target_link_libraries(infer_without_face_align_demo ${FASTDEPLOY_LIBS}) diff --git a/examples/vision/facedet/scrfd/cpp/README_CN.md b/examples/vision/facedet/scrfd/cpp/README_CN.md index 1c01173b247..b4e0257630b 100644 --- a/examples/vision/facedet/scrfd/cpp/README_CN.md +++ b/examples/vision/facedet/scrfd/cpp/README_CN.md @@ -23,13 +23,21 @@ make -j wget https://bj.bcebos.com/paddlehub/fastdeploy/scrfd_500m_bnkps_shape640x640.onnx wget https://raw.githubusercontent.com/DefTruth/lite.ai.toolkit/main/examples/lite/resources/test_lite_face_detector_3.jpg +# SCRFD +# CPU推理 +./infer_without_face_align_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 0 +# GPU推理 +./infer_without_face_align_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 1 +# GPU上TensorRT推理 +./infer_without_face_align_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 2 +# SCRFD + FaceAlign # CPU推理 -./infer_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 0 +./infer_with_face_align_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 0 # GPU推理 -./infer_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 1 +./infer_with_face_align_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 1 # GPU上TensorRT推理 -./infer_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 2 +./infer_with_face_align_demo scrfd_500m_bnkps_shape640x640.onnx test_lite_face_detector_3.jpg 2 ``` 运行完成可视化结果如下图所示 diff --git a/examples/vision/facedet/scrfd/cpp/infer_with_face_align.cc b/examples/vision/facedet/scrfd/cpp/infer_with_face_align.cc new file mode 100644 index 00000000000..1a32ac67c8b --- /dev/null +++ b/examples/vision/facedet/scrfd/cpp/infer_with_face_align.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" + +void CpuInfer(const std::string& model_file, const std::string& image_file) { + auto model = fastdeploy::vision::facedet::SCRFD(model_file); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::FaceDetectionResult res; + if (!model.Predict(&im, &res)) { + std::cerr << "Failed to predict." 
<< std::endl; + return; + } + std::cout << res.Str() << std::endl; + + auto vis_im_list = + fastdeploy::vision::utils::AlignFaceWithFivePoints(im, res); + if (!vis_im_list.empty()) { + cv::imwrite("vis_result.jpg", vis_im_list[0]); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; + } +} + +void GpuInfer(const std::string& model_file, const std::string& image_file) { + auto option = fastdeploy::RuntimeOption(); + option.UseGpu(); + auto model = fastdeploy::vision::facedet::SCRFD(model_file, "", option); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::FaceDetectionResult res; + if (!model.Predict(&im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + std::cout << res.Str() << std::endl; + + auto vis_im_list = + fastdeploy::vision::utils::AlignFaceWithFivePoints(im, res); + if (!vis_im_list.empty()) { + cv::imwrite("vis_result.jpg", vis_im_list[0]); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; + } +} + +void TrtInfer(const std::string& model_file, const std::string& image_file) { + auto option = fastdeploy::RuntimeOption(); + option.UseGpu(); + option.UseTrtBackend(); + option.SetTrtInputShape("images", {1, 3, 640, 640}); + auto model = fastdeploy::vision::facedet::SCRFD(model_file, "", option); + if (!model.Initialized()) { + std::cerr << "Failed to initialize." << std::endl; + return; + } + + auto im = cv::imread(image_file); + + fastdeploy::vision::FaceDetectionResult res; + if (!model.Predict(&im, &res)) { + std::cerr << "Failed to predict." << std::endl; + return; + } + std::cout << res.Str() << std::endl; + + auto vis_im_list = + fastdeploy::vision::utils::AlignFaceWithFivePoints(im, res); + if (!vis_im_list.empty()) { + cv::imwrite("vis_result.jpg", vis_im_list[0]); + std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl; + } +} + +int main(int argc, char* argv[]) { + if (argc < 4) { + std::cout + << "Usage: infer_demo path/to/model path/to/image run_option, " + "e.g ./infer_model scrfd_500m_bnkps_shape640x640.onnx ./test.jpeg 0" + << std::endl; + std::cout << "The data type of run_option is int, 0: run with cpu; 1: run " + "with gpu; 2: run with gpu and use tensorrt backend." 
+ << std::endl; + return -1; + } + + if (std::atoi(argv[3]) == 0) { + CpuInfer(argv[1], argv[2]); + } else if (std::atoi(argv[3]) == 1) { + GpuInfer(argv[1], argv[2]); + } else if (std::atoi(argv[3]) == 2) { + TrtInfer(argv[1], argv[2]); + } + return 0; +} diff --git a/examples/vision/facedet/scrfd/cpp/infer.cc b/examples/vision/facedet/scrfd/cpp/infer_without_face_align.cc similarity index 100% rename from examples/vision/facedet/scrfd/cpp/infer.cc rename to examples/vision/facedet/scrfd/cpp/infer_without_face_align.cc diff --git a/examples/vision/faceid/insightface/cpp/README_CN.md b/examples/vision/faceid/insightface/cpp/README_CN.md index ff03789ca2c..7f0dc442c5c 100644 --- a/examples/vision/faceid/insightface/cpp/README_CN.md +++ b/examples/vision/faceid/insightface/cpp/README_CN.md @@ -101,7 +101,7 @@ VPL模型加载和初始化,其中model_file为导出的ONNX模型格式。 #### Predict函数 > ```c++ -> ArcFace::Predict(cv::Mat* im, FaceRecognitionResult* result) +> ArcFace::Predict(const cv::Mat& im, FaceRecognitionResult* result) > ``` > > 模型预测接口,输入图像直接输出检测结果。 @@ -121,8 +121,6 @@ VPL模型加载和初始化,其中model_file为导出的ONNX模型格式。 通过InsightFaceRecognitionPreprocessor::SetAlpha(std::vector& alpha)来进行修改 > > * **beta**(vector<float>): 预处理归一化的beta值,计算公式为`x'=x*alpha+beta`,beta默认为[-1.f, -1.f, -1.f], 通过InsightFaceRecognitionPreprocessor::SetBeta(std::vector& beta)来进行修改 -> > * **permute**(bool): 预处理是否将BGR转换成RGB,默认true, - 通过InsightFaceRecognitionPreprocessor::SetPermute(bool permute)来进行修改 #### InsightFaceRecognitionPostprocessor成员变量(后处理参数) > > * **l2_normalize**(bool): 输出人脸向量之前是否执行l2归一化,默认false, diff --git a/examples/vision/faceid/insightface/python/README_CN.md b/examples/vision/faceid/insightface/python/README_CN.md index b30921b7e7f..b5d444c5f9b 100644 --- a/examples/vision/faceid/insightface/python/README_CN.md +++ b/examples/vision/faceid/insightface/python/README_CN.md @@ -100,7 +100,6 @@ ArcFace模型加载和初始化,其中model_file为导出的ONNX模型格式 > > * **size**(list[int]): 通过此参数修改预处理过程中resize的大小,包含两个整型元素,表示[width, height], 默认值为[112, 112] > > * **alpha**(list[float]): 预处理归一化的alpha值,计算公式为`x'=x*alpha+beta`,alpha默认为[1. / 127.5, 1.f / 127.5, 1. 
/ 127.5] > > * **beta**(list[float]): 预处理归一化的beta值,计算公式为`x'=x*alpha+beta`,beta默认为[-1.f, -1.f, -1.f] -> > * **swap_rb**(bool): 预处理是否将BGR转换成RGB,默认True #### AdaFacePostprocessor的成员变量 以下变量为AdaFacePostprocessor的成员变量 diff --git a/examples/vision/faceid/insightface/python/infer_arcface.py b/examples/vision/faceid/insightface/python/infer_arcface.py index 06e8ef00186..c426a35ca3e 100644 --- a/examples/vision/faceid/insightface/python/infer_arcface.py +++ b/examples/vision/faceid/insightface/python/infer_arcface.py @@ -3,7 +3,6 @@ import numpy as np -# 余弦相似度 def cosine_similarity(a, b): a = np.array(a) b = np.array(b) @@ -56,24 +55,17 @@ def build_option(args): args = parse_arguments() -# 配置runtime,加载模型 runtime_option = build_option(args) model = fd.vision.faceid.ArcFace(args.model, runtime_option=runtime_option) -# 加载图片 face0 = cv2.imread(args.face) # 0,1 同一个人 face1 = cv2.imread(args.face_positive) face2 = cv2.imread(args.face_negative) # 0,2 不同的人 -# 设置 l2 normalize -model.postprocessor.l2_normalize = True - -# 预测图片检测结果 result0 = model.predict(face0) result1 = model.predict(face1) result2 = model.predict(face2) -# 计算余弦相似度 embedding0 = result0.embedding embedding1 = result1.embedding embedding2 = result2.embedding @@ -81,7 +73,6 @@ def build_option(args): cosine01 = cosine_similarity(embedding0, embedding1) cosine02 = cosine_similarity(embedding0, embedding2) -# 打印结果 print(result0, end="") print(result1, end="") print(result2, end="") diff --git a/examples/vision/faceid/insightface/python/infer_cosface.py b/examples/vision/faceid/insightface/python/infer_cosface.py index 2bb1292f492..b316057e0a0 100644 --- a/examples/vision/faceid/insightface/python/infer_cosface.py +++ b/examples/vision/faceid/insightface/python/infer_cosface.py @@ -3,7 +3,6 @@ import numpy as np -# 余弦相似度 def cosine_similarity(a, b): a = np.array(a) b = np.array(b) @@ -56,24 +55,17 @@ def build_option(args): args = parse_arguments() -# 配置runtime,加载模型 runtime_option = build_option(args) model = fd.vision.faceid.CosFace(args.model, runtime_option=runtime_option) -# 加载图片 -face0 = cv2.imread(args.face) # 0,1 同一个人 +face0 = cv2.imread(args.face) face1 = cv2.imread(args.face_positive) -face2 = cv2.imread(args.face_negative) # 0,2 不同的人 +face2 = cv2.imread(args.face_negative) -# 设置 l2 normalize -model.postprocessor.l2_normalize = True - -# 预测图片检测结果 result0 = model.predict(face0) result1 = model.predict(face1) result2 = model.predict(face2) -# 计算余弦相似度 embedding0 = result0.embedding embedding1 = result1.embedding embedding2 = result2.embedding @@ -81,7 +73,6 @@ def build_option(args): cosine01 = cosine_similarity(embedding0, embedding1) cosine02 = cosine_similarity(embedding0, embedding2) -# 打印结果 print(result0, end="") print(result1, end="") print(result2, end="") diff --git a/examples/vision/faceid/insightface/python/infer_partial_fc.py b/examples/vision/faceid/insightface/python/infer_partial_fc.py index e81531e6eec..e979e774cdd 100644 --- a/examples/vision/faceid/insightface/python/infer_partial_fc.py +++ b/examples/vision/faceid/insightface/python/infer_partial_fc.py @@ -3,7 +3,6 @@ import numpy as np -# 余弦相似度 def cosine_similarity(a, b): a = np.array(a) b = np.array(b) @@ -56,24 +55,18 @@ def build_option(args): args = parse_arguments() -# 配置runtime,加载模型 runtime_option = build_option(args) model = fd.vision.faceid.PartialFC(args.model, runtime_option=runtime_option) # 加载图片 -face0 = cv2.imread(args.face) # 0,1 同一个人 +face0 = cv2.imread(args.face) face1 = cv2.imread(args.face_positive) -face2 = cv2.imread(args.face_negative) # 0,2 不同的人 
+face2 = cv2.imread(args.face_negative) -# 设置 l2 normalize -model.postprocessor.l2_normalize = True - -# 预测图片检测结果 result0 = model.predict(face0) result1 = model.predict(face1) result2 = model.predict(face2) -# 计算余弦相似度 embedding0 = result0.embedding embedding1 = result1.embedding embedding2 = result2.embedding @@ -81,7 +74,6 @@ def build_option(args): cosine01 = cosine_similarity(embedding0, embedding1) cosine02 = cosine_similarity(embedding0, embedding2) -# 打印结果 print(result0, end="") print(result1, end="") print(result2, end="") diff --git a/examples/vision/faceid/insightface/python/infer_vpl.py b/examples/vision/faceid/insightface/python/infer_vpl.py index 6113ad3df72..8c6f711f3d4 100644 --- a/examples/vision/faceid/insightface/python/infer_vpl.py +++ b/examples/vision/faceid/insightface/python/infer_vpl.py @@ -3,7 +3,6 @@ import numpy as np -# 余弦相似度 def cosine_similarity(a, b): a = np.array(a) b = np.array(b) @@ -56,24 +55,17 @@ def build_option(args): args = parse_arguments() -# 配置runtime,加载模型 runtime_option = build_option(args) model = fd.vision.faceid.VPL(args.model, runtime_option=runtime_option) -# 加载图片 face0 = cv2.imread(args.face) # 0,1 同一个人 face1 = cv2.imread(args.face_positive) face2 = cv2.imread(args.face_negative) # 0,2 不同的人 -# 设置 l2 normalize -model.postprocessor.l2_normalize = True - -# 预测图片检测结果 result0 = model.predict(face0) result1 = model.predict(face1) result2 = model.predict(face2) -# 计算余弦相似度 embedding0 = result0.embedding embedding1 = result1.embedding embedding2 = result2.embedding @@ -81,7 +73,6 @@ def build_option(args): cosine01 = cosine_similarity(embedding0, embedding1) cosine02 = cosine_similarity(embedding0, embedding2) -# 打印结果 print(result0, end="") print(result1, end="") print(result2, end="") diff --git a/examples/vision/faceid/insightface/rknpu2/README.md b/examples/vision/faceid/insightface/rknpu2/README.md new file mode 100644 index 00000000000..01bee2e287f --- /dev/null +++ b/examples/vision/faceid/insightface/rknpu2/README.md @@ -0,0 +1,54 @@ +[English](README.md) | 简体中文 +# InsightFace RKNPU准备部署模型 + +本教程提供InsightFace模型在RKNPU2环境下的部署,模型的详细介绍已经ONNX模型的下载请查看[模型介绍文档](../README.md)。 + +## 支持模型列表 +目前FastDeploy支持如下模型的部署 +- ArcFace +- CosFace +- PartialFC +- VPL + +## 下载预训练ONNX模型 + +为了方便开发者的测试,下面提供了InsightFace导出的各系列模型,开发者可直接下载使用。(下表中模型的精度来源于源官方库)其中精度指标来源于InsightFace中对各模型的介绍,详情各参考InsightFace中的说明 + +| 模型 | 大小 | 精度 (AgeDB_30) | +|:-------------------------------------------------------------------------------------------|:------|:--------------| +| [CosFace-r18](https://bj.bcebos.com/paddlehub/fastdeploy/glint360k_cosface_r18.onnx) | 92MB | 97.7 | +| [CosFace-r34](https://bj.bcebos.com/paddlehub/fastdeploy/glint360k_cosface_r34.onnx) | 131MB | 98.3 | +| [CosFace-r50](https://bj.bcebos.com/paddlehub/fastdeploy/glint360k_cosface_r50.onnx) | 167MB | 98.3 | +| [CosFace-r100](https://bj.bcebos.com/paddlehub/fastdeploy/glint360k_cosface_r100.onnx) | 249MB | 98.4 | +| [ArcFace-r18](https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r18.onnx) | 92MB | 97.7 | +| [ArcFace-r34](https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r34.onnx) | 131MB | 98.1 | +| [ArcFace-r50](https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r50.onnx) | 167MB | - | +| [ArcFace-r100](https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r100.onnx) | 249MB | 98.4 | +| [ArcFace-r100_lr0.1](https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_r100_lr01.onnx) | 249MB | 98.4 | +| [PartialFC-r34](https://bj.bcebos.com/paddlehub/fastdeploy/partial_fc_glint360k_r50.onnx) | 167MB | - 
| +| [PartialFC-r50](https://bj.bcebos.com/paddlehub/fastdeploy/partial_fc_glint360k_r100.onnx) | 249MB | - | + + +## 转换为RKNPU模型 + +```bash +wget https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r18.onnx + +python -m paddle2onnx.optimize --input_model ./ms1mv3_arcface_r18/ms1mv3_arcface_r18.onnx \ + --output_model ./ms1mv3_arcface_r18/ms1mv3_arcface_r18.onnx \ + --input_shape_dict "{'data':[1,3,112,112]}" + +python /Path/To/FastDeploy/tools/rknpu2/export.py \ + --config_path tools/rknpu2/config/arcface_unquantized.yaml \ + --target_platform rk3588 +``` + +## 详细部署文档 + +- [Python部署](python) +- [C++部署](cpp) + + +## 版本说明 + +- 本版本文档和代码基于[InsightFace CommitID:babb9a5](https://github.com/deepinsight/insightface/commit/babb9a5) 编写 diff --git a/examples/vision/faceid/insightface/rknpu2/cpp/CMakeLists.txt b/examples/vision/faceid/insightface/rknpu2/cpp/CMakeLists.txt new file mode 100644 index 00000000000..ce3b467ba6f --- /dev/null +++ b/examples/vision/faceid/insightface/rknpu2/cpp/CMakeLists.txt @@ -0,0 +1,11 @@ +PROJECT(infer_demo C CXX) +CMAKE_MINIMUM_REQUIRED (VERSION 3.10) + +option(FASTDEPLOY_INSTALL_DIR "Path of downloaded fastdeploy sdk.") + +include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake) + +include_directories(${FASTDEPLOY_INCS}) + +add_executable(infer_arcface_demo ${PROJECT_SOURCE_DIR}/infer_arcface.cc) +target_link_libraries(infer_arcface_demo ${FASTDEPLOY_LIBS}) diff --git a/examples/vision/faceid/insightface/rknpu2/cpp/README.md b/examples/vision/faceid/insightface/rknpu2/cpp/README.md new file mode 100644 index 00000000000..bb88804cdea --- /dev/null +++ b/examples/vision/faceid/insightface/rknpu2/cpp/README.md @@ -0,0 +1,136 @@ +[English](README.md) | 简体中文 +# InsightFace C++部署示例 + +FastDeploy支持在RKNPU上部署包括ArcFace\CosFace\VPL\Partial_FC在内的InsightFace系列模型。 + +本目录下提供`infer_arcface.cc`快速完成InsighFace模型包括ArcFace在CPU/RKNPU加速部署的示例。 + + +在部署前,需确认以下两个步骤: + +1. 软硬件环境满足要求 +2. 根据开发环境,下载预编译部署库或者从头编译FastDeploy仓库 + +以上步骤请参考[RK2代NPU部署库编译](../../../../../../docs/cn/build_and_install/rknpu2.md)实现 + +在本目录执行如下命令即可完成编译测试 + +```bash +mkdir build +cd build +# FastDeploy version need >=1.0.3 +wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz +tar xvf fastdeploy-linux-x64-x.x.x.tgz +cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x +make -j + +# 下载官方转换好的ArcFace模型文件和测试图片 +wget https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r18.onnx +wget https://bj.bcebos.com/paddlehub/fastdeploy/rknpu2/face_demo.zip +unzip face_demo.zip + +# CPU推理 +./infer_arcface_demo ms1mv3_arcface_r100.onnx face_0.jpg face_1.jpg face_2.jpg 0 +# RKNPU推理 +./infer_arcface_demo ms1mv3_arcface_r100.onnx face_0.jpg face_1.jpg face_2.jpg 1 +``` + +运行完成可视化结果如下图所示 + +
+(人脸识别可视化结果图片)
+ +以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考: +- [如何在Windows中使用FastDeploy C++ SDK](../../../../../docs/cn/faq/use_sdk_on_windows.md) + +## InsightFace C++接口 + +### ArcFace类 + +```c++ +fastdeploy::vision::faceid::ArcFace( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +ArcFace模型加载和初始化,其中model_file为导出的ONNX模型格式。 + +### CosFace类 + +```c++ +fastdeploy::vision::faceid::CosFace( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +CosFace模型加载和初始化,其中model_file为导出的ONNX模型格式。 + +### PartialFC类 + +```c++ +fastdeploy::vision::faceid::PartialFC( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +PartialFC模型加载和初始化,其中model_file为导出的ONNX模型格式。 + +### VPL类 + +```c++ +fastdeploy::vision::faceid::VPL( + const string& model_file, + const string& params_file = "", + const RuntimeOption& runtime_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) +``` + +VPL模型加载和初始化,其中model_file为导出的ONNX模型格式。 +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX时,此参数传入空字符串即可 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为ONNX格式 + +#### Predict函数 + +> ```c++ +> ArcFace::Predict(const cv::Mat& im, FaceRecognitionResult* result) +> ``` +> +> 模型预测接口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **im**: 输入图像,注意需为HWC,BGR格式 +> > * **result**: 检测结果,包括检测框,各个框的置信度, FaceRecognitionResult说明参考[视觉模型预测结果](../../../../../docs/api/vision_results/) + +### 修改预处理以及后处理的参数 +预处理和后处理的参数的需要通过修改InsightFaceRecognitionPostprocessor,InsightFaceRecognitionPreprocessor的成员变量来进行修改。 + +#### InsightFaceRecognitionPreprocessor成员变量(预处理参数) +> > * **size**(vector<int>): 通过此参数修改预处理过程中resize的大小,包含两个整型元素,表示[width, height], 默认值为[112, 112], + 通过InsightFaceRecognitionPreprocessor::SetSize(std::vector& size)来进行修改 +> > * **alpha**(vector<float>): 预处理归一化的alpha值,计算公式为`x'=x*alpha+beta`,alpha默认为[1. / 127.5, 1.f / 127.5, 1. / 127.5], + 通过InsightFaceRecognitionPreprocessor::SetAlpha(std::vector& alpha)来进行修改 +> > * **beta**(vector<float>): 预处理归一化的beta值,计算公式为`x'=x*alpha+beta`,beta默认为[-1.f, -1.f, -1.f], + 通过InsightFaceRecognitionPreprocessor::SetBeta(std::vector& beta)来进行修改 + +#### InsightFaceRecognitionPostprocessor成员变量(后处理参数) +> > * **l2_normalize**(bool): 输出人脸向量之前是否执行l2归一化,默认false, + InsightFaceRecognitionPostprocessor::SetL2Normalize(bool& l2_normalize)来进行修改 + +- [模型介绍](../../../) +- [Python部署](../python) +- [视觉模型预测结果](../../../../../../docs/api/vision_results/README.md) +- [如何切换模型推理后端引擎](../../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/faceid/insightface/rknpu2/cpp/infer_arcface.cc b/examples/vision/faceid/insightface/rknpu2/cpp/infer_arcface.cc new file mode 100644 index 00000000000..f9a4d85ff0c --- /dev/null +++ b/examples/vision/faceid/insightface/rknpu2/cpp/infer_arcface.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "fastdeploy/vision.h" + +void CpuInfer(const std::string& model_file, + const std::vector& image_file) { + auto model = fastdeploy::vision::faceid::ArcFace(model_file, ""); + + cv::Mat face0 = cv::imread(image_file[0]); + fastdeploy::vision::FaceRecognitionResult res0; + if (!model.Predict(face0, &res0)) { + std::cerr << "Prediction Failed." << std::endl; + } + + cv::Mat face1 = cv::imread(image_file[1]); + fastdeploy::vision::FaceRecognitionResult res1; + if (!model.Predict(face1, &res1)) { + std::cerr << "Prediction Failed." << std::endl; + } + + cv::Mat face2 = cv::imread(image_file[2]); + fastdeploy::vision::FaceRecognitionResult res2; + if (!model.Predict(face2, &res2)) { + std::cerr << "Prediction Failed." << std::endl; + return; + } + + std::cout << "Prediction Done!" << std::endl; + + std::cout << "--- [Face 0]:" << res0.Str(); + std::cout << "--- [Face 1]:" << res1.Str(); + std::cout << "--- [Face 2]:" << res2.Str(); + + float cosine01 = fastdeploy::vision::utils::CosineSimilarity( + res0.embedding, res1.embedding, + model.GetPostprocessor().GetL2Normalize()); + float cosine02 = fastdeploy::vision::utils::CosineSimilarity( + res0.embedding, res2.embedding, + model.GetPostprocessor().GetL2Normalize()); + std::cout << "Detect Done! Cosine 01: " << cosine01 + << ", Cosine 02:" << cosine02 << std::endl; +} + +void RKNPUInfer(const std::string& model_file, + const std::vector& image_file) { + std::string params_file; + auto option = fastdeploy::RuntimeOption(); + option.UseRKNPU2(); + auto format = fastdeploy::ModelFormat::RKNN; + auto model = fastdeploy::vision::faceid::ArcFace(model_file, params_file, + option, format); + model.GetPreprocessor().DisableNormalize(); + model.GetPreprocessor().DisablePermute(); + + cv::Mat face0 = cv::imread(image_file[0]); + fastdeploy::vision::FaceRecognitionResult res0; + if (!model.Predict(face0, &res0)) { + std::cerr << "Prediction Failed." << std::endl; + return; + } + + cv::Mat face1 = cv::imread(image_file[1]); + fastdeploy::vision::FaceRecognitionResult res1; + if (!model.Predict(face1, &res1)) { + std::cerr << "Prediction Failed." << std::endl; + return; + } + + cv::Mat face2 = cv::imread(image_file[2]); + fastdeploy::vision::FaceRecognitionResult res2; + if (!model.Predict(face2, &res2)) { + std::cerr << "Prediction Failed." << std::endl; + return; + } + + std::cout << "Prediction Done!" << std::endl; + + std::cout << "--- [Face 0]:" << res0.Str(); + std::cout << "--- [Face 1]:" << res1.Str(); + std::cout << "--- [Face 2]:" << res2.Str(); + + float cosine01 = fastdeploy::vision::utils::CosineSimilarity( + res0.embedding, res1.embedding, + model.GetPostprocessor().GetL2Normalize()); + float cosine02 = fastdeploy::vision::utils::CosineSimilarity( + res0.embedding, res2.embedding, + model.GetPostprocessor().GetL2Normalize()); + std::cout << "Detect Done! 
Cosine 01: " << cosine01 + << ", Cosine 02:" << cosine02 << std::endl; +} + +int main(int argc, char* argv[]) { + if (argc < 6) { + std::cout << "Usage: infer_demo path/to/model path/to/image run_option, " + "e.g ./infer_arcface_demo ms1mv3_arcface_r100.onnx " + "face_0.jpg face_1.jpg face_2.jpg 0" + << std::endl; + std::cout << "The data type of run_option is int, " + "0: run with cpu; 1: run with rknpu2." + << std::endl; + return -1; + } + + std::vector image_files = {argv[2], argv[3], argv[4]}; + if (std::atoi(argv[5]) == 0) { + CpuInfer(argv[1], image_files); + } else if (std::atoi(argv[5]) == 1) { + RKNPUInfer(argv[1], image_files); + } + return 0; +} diff --git a/examples/vision/faceid/insightface/rknpu2/python/README_CN.md b/examples/vision/faceid/insightface/rknpu2/python/README_CN.md new file mode 100644 index 00000000000..fd539f70875 --- /dev/null +++ b/examples/vision/faceid/insightface/rknpu2/python/README_CN.md @@ -0,0 +1,108 @@ +[English](README.md) | 简体中文 +# InsightFace Python部署示例 + +FastDeploy支持在RKNPU上部署包括ArcFace\CosFace\VPL\Partial_FC在内的InsightFace系列模型。 + +本目录下提供`infer_arcface.py`快速完成InsighFace模型包括ArcFace在CPU/RKNPU加速部署的示例。 + + +在部署前,需确认以下步骤: + +- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../../../docs/cn/build_and_install/rknpu2.md) + +```bash +#下载部署示例代码 +git clone https://github.com/PaddlePaddle/FastDeploy.git +cd examples/vision/faceid/insightface/python/ + +#下载ArcFace模型文件和测试图片 +wget https://bj.bcebos.com/paddlehub/fastdeploy/ms1mv3_arcface_r100.onnx +wget https://bj.bcebos.com/paddlehub/fastdeploy/rknpu2/face_demo.zip +unzip face_demo.zip + +# CPU推理 +python infer_arcface.py --model ms1mv3_arcface_r100.onnx \ + --face face_0.jpg \ + --face_positive face_1.jpg \ + --face_negative face_2.jpg \ + --device cpu +# GPU推理 +python infer_arcface.py --model ms1mv3_arcface_r100.onnx \ + --face face_0.jpg \ + --face_positive face_1.jpg \ + --face_negative face_2.jpg \ + --device gpu +``` + +运行完成可视化结果如下图所示 + +
+<!-- visualization result images -->
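+
+The commands above cover the CPU/GPU paths. For RKNPU2, the ONNX model is first converted to RKNN format (see `tools/rknpu2`) and then loaded with `ModelFormat.RKNN`; the normalize and permute steps are disabled on the FastDeploy side because the conversion config already carries the mean/std values. The sketch below only illustrates that path; the `.rknn` file name is an assumption and depends on your conversion output.
+
+```python
+# Minimal RKNPU2 sketch; the .rknn file name below is assumed.
+import cv2
+import fastdeploy as fd
+
+option = fd.RuntimeOption()
+option.use_rknpu2()
+
+# Load the converted RKNN model instead of the ONNX one.
+model = fd.vision.faceid.ArcFace(
+    "ms1mv3_arcface_r100.rknn",
+    runtime_option=option,
+    model_format=fd.ModelFormat.RKNN)
+
+# Normalization/permute are folded into the RKNN model, so skip them here.
+model.preprocessor.disable_normalize()
+model.preprocessor.disable_permute()
+
+result = model.predict(cv2.imread("face_0.jpg"))
+print(result)
+```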
+ +```bash +Prediction Done! +--- [Face 0]:FaceRecognitionResult: [Dim(512), Min(-2.309220), Max(2.372197), Mean(0.016987)] +--- [Face 1]:FaceRecognitionResult: [Dim(512), Min(-2.288258), Max(1.995104), Mean(-0.003400)] +--- [Face 2]:FaceRecognitionResult: [Dim(512), Min(-3.243411), Max(3.875866), Mean(-0.030682)] +Detect Done! Cosine 01: 0.814385, Cosine 02:-0.059388 + +``` + +## InsightFace Python接口 + +```python +fastdeploy.vision.faceid.ArcFace(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) +fastdeploy.vision.faceid.CosFace(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) +fastdeploy.vision.faceid.PartialFC(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) +fastdeploy.vision.faceid.VPL(model_file, params_file=None, runtime_option=None, model_format=ModelFormat.ONNX) +``` + +ArcFace模型加载和初始化,其中model_file为导出的ONNX模型格式 + +**参数** + +> * **model_file**(str): 模型文件路径 +> * **params_file**(str): 参数文件路径,当模型格式为ONNX格式时,此参数无需设定 +> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置 +> * **model_format**(ModelFormat): 模型格式,默认为ONNX + +### predict函数 + +> ```python +> ArcFace.predict(image_data) +> ``` +> +> 模型预测结口,输入图像直接输出检测结果。 +> +> **参数** +> +> > * **image_data**(np.ndarray): 输入数据,注意需为HWC,BGR格式 + +> **返回** +> +> > 返回`fastdeploy.vision.FaceRecognitionResult`结构体,结构体说明参考文档[视觉模型预测结果](../../../../../docs/api/vision_results/) + +### 类成员属性 +#### 预处理参数 +用户可按照自己的实际需求,修改下列预处理参数,从而影响最终的推理和部署效果 + +#### AdaFacePreprocessor的成员变量 +以下变量为AdaFacePreprocessor的成员变量 +> > * **size**(list[int]): 通过此参数修改预处理过程中resize的大小,包含两个整型元素,表示[width, height], 默认值为[112, 112] +> > * **alpha**(list[float]): 预处理归一化的alpha值,计算公式为`x'=x*alpha+beta`,alpha默认为[1. / 127.5, 1.f / 127.5, 1. / 127.5] +> > * **beta**(list[float]): 预处理归一化的beta值,计算公式为`x'=x*alpha+beta`,beta默认为[-1.f, -1.f, -1.f] + +#### AdaFacePostprocessor的成员变量 +以下变量为AdaFacePostprocessor的成员变量 +> > * **l2_normalize**(bool): 输出人脸向量之前是否执行l2归一化,默认False + + +## 其它文档 + +- [InsightFace 模型介绍](..) 
+- [InsightFace C++部署](../cpp) +- [模型预测结果说明](../../../../../docs/api/vision_results/) +- [如何切换模型推理后端引擎](../../../../../docs/cn/faq/how_to_change_backend.md) diff --git a/examples/vision/faceid/insightface/rknpu2/python/infer_arcface.py b/examples/vision/faceid/insightface/rknpu2/python/infer_arcface.py new file mode 100644 index 00000000000..90222a27273 --- /dev/null +++ b/examples/vision/faceid/insightface/rknpu2/python/infer_arcface.py @@ -0,0 +1,76 @@ +import fastdeploy as fd +import cv2 +import numpy as np + + +def cosine_similarity(a, b): + a = np.array(a) + b = np.array(b) + mul_a = np.linalg.norm(a, ord=2) + mul_b = np.linalg.norm(b, ord=2) + mul_ab = np.dot(a, b) + return mul_ab / (mul_a * mul_b) + + +def parse_arguments(): + import argparse + import ast + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", required=True, help="Path of insgihtface onnx model.") + parser.add_argument( + "--face", required=True, help="Path of test face image file.") + parser.add_argument( + "--face_positive", + required=True, + help="Path of test face_positive image file.") + parser.add_argument( + "--face_negative", + required=True, + help="Path of test face_negative image file.") + parser.add_argument( + "--device", + type=str, + default='cpu', + help="Type of inference device, support 'cpu' or 'gpu'.") + return parser.parse_args() + + +def build_option(args): + option = fd.RuntimeOption() + + if args.device.lower() == "npu": + option.use_rknpu2() + return option + + +args = parse_arguments() + +runtime_option = fd.RuntimeOption() +model = fd.vision.faceid.ArcFace(args.model, runtime_option=runtime_option) +if args.device.lower() == "npu": + runtime_option.use_rknpu2() + model.preprocessor.disable_normalize() + model.preprocessor.disable_permute() + +face0 = cv2.imread(args.face) +face1 = cv2.imread(args.face_positive) +face2 = cv2.imread(args.face_negative) + +result0 = model.predict(face0) +result1 = model.predict(face1) +result2 = model.predict(face2) + +embedding0 = result0.embedding +embedding1 = result1.embedding +embedding2 = result2.embedding + +cosine01 = cosine_similarity(embedding0, embedding1) +cosine02 = cosine_similarity(embedding0, embedding2) + +print(result0, end="") +print(result1, end="") +print(result2, end="") +print("Cosine 01: ", cosine01) +print("Cosine 02: ", cosine02) +print(model.runtime_option) diff --git a/fastdeploy/vision/detection/ppdet/model.h b/fastdeploy/vision/detection/ppdet/model.h index a3797bdb8b4..1a33c477138 100755 --- a/fastdeploy/vision/detection/ppdet/model.h +++ b/fastdeploy/vision/detection/ppdet/model.h @@ -253,7 +253,7 @@ class FASTDEPLOY_DECL PaddleYOLOv8 : public PPDetBase { const ModelFormat& model_format = ModelFormat::PADDLE) : PPDetBase(model_file, params_file, config_file, custom_option, model_format) { - valid_cpu_backends = {Backend::OPENVINO, Backend::ORT, Backend::PDINFER}; + valid_cpu_backends = {Backend::OPENVINO, Backend::ORT, Backend::PDINFER, Backend::LITE}; valid_gpu_backends = {Backend::ORT, Backend::PDINFER, Backend::TRT}; valid_kunlunxin_backends = {Backend::LITE}; initialized = Initialize(); diff --git a/fastdeploy/vision/faceid/contrib/insightface/base.cc b/fastdeploy/vision/faceid/contrib/insightface/base.cc old mode 100755 new mode 100644 index 35d8b808609..8b970cb82c0 --- a/fastdeploy/vision/faceid/contrib/insightface/base.cc +++ b/fastdeploy/vision/faceid/contrib/insightface/base.cc @@ -22,7 +22,6 @@ InsightFaceRecognitionBase::InsightFaceRecognitionBase( const std::string& model_file, const std::string& 
params_file, const fastdeploy::RuntimeOption& custom_option, const fastdeploy::ModelFormat& model_format) { - if (model_format == ModelFormat::ONNX) { valid_cpu_backends = {Backend::ORT}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; @@ -31,6 +30,7 @@ InsightFaceRecognitionBase::InsightFaceRecognitionBase( valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; valid_kunlunxin_backends = {Backend::LITE}; } + valid_rknpu_backends = {Backend::RKNPU2}; runtime_option = custom_option; runtime_option.model_format = model_format; runtime_option.model_file = model_file; @@ -55,8 +55,9 @@ bool InsightFaceRecognitionBase::Predict(const cv::Mat& im, return true; } -bool InsightFaceRecognitionBase::BatchPredict(const std::vector& images, - std::vector* results){ +bool InsightFaceRecognitionBase::BatchPredict( + const std::vector& images, + std::vector* results) { std::vector fd_images = WrapMat(images); FDASSERT(images.size() == 1, "Only support batch = 1 now."); if (!preprocessor_.Run(&fd_images, &reused_input_tensors_)) { @@ -70,8 +71,9 @@ bool InsightFaceRecognitionBase::BatchPredict(const std::vector& images return false; } - if (!postprocessor_.Run(reused_output_tensors_, results)){ - FDERROR << "Failed to postprocess the inference results by runtime." << std::endl; + if (!postprocessor_.Run(reused_output_tensors_, results)) { + FDERROR << "Failed to postprocess the inference results by runtime." + << std::endl; return false; } return true; diff --git a/fastdeploy/vision/faceid/contrib/insightface/insightface_pybind.cc b/fastdeploy/vision/faceid/contrib/insightface/insightface_pybind.cc index b193d9fb795..fa0e2babf67 100644 --- a/fastdeploy/vision/faceid/contrib/insightface/insightface_pybind.cc +++ b/fastdeploy/vision/faceid/contrib/insightface/insightface_pybind.cc @@ -19,83 +19,120 @@ void BindInsightFace(pybind11::module& m) { pybind11::class_( m, "InsightFaceRecognitionPreprocessor") .def(pybind11::init()) - .def("run", [](vision::faceid::InsightFaceRecognitionPreprocessor& self, - std::vector& im_list) { - std::vector images; - for (size_t i = 0; i < im_list.size(); ++i) { - images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i]))); - } - std::vector outputs; - if (!self.Run(&images, &outputs)) { - throw std::runtime_error("Failed to preprocess the input data in InsightFaceRecognitionPreprocessor."); - } - for (size_t i = 0; i < outputs.size(); ++i) { - outputs[i].StopSharing(); - } - return outputs; - }) - .def_property("permute", &vision::faceid::InsightFaceRecognitionPreprocessor::GetPermute, - &vision::faceid::InsightFaceRecognitionPreprocessor::SetPermute) - .def_property("alpha", &vision::faceid::InsightFaceRecognitionPreprocessor::GetAlpha, - &vision::faceid::InsightFaceRecognitionPreprocessor::SetAlpha) - .def_property("beta", &vision::faceid::InsightFaceRecognitionPreprocessor::GetBeta, - &vision::faceid::InsightFaceRecognitionPreprocessor::SetBeta) - .def_property("size", &vision::faceid::InsightFaceRecognitionPreprocessor::GetSize, - &vision::faceid::InsightFaceRecognitionPreprocessor::SetSize); + .def("run", + [](vision::faceid::InsightFaceRecognitionPreprocessor& self, + std::vector& im_list) { + std::vector images; + for (size_t i = 0; i < im_list.size(); ++i) { + images.push_back(vision::WrapMat(PyArrayToCvMat(im_list[i]))); + } + std::vector outputs; + if (!self.Run(&images, &outputs)) { + throw std::runtime_error( + "Failed to preprocess the input data in " + "InsightFaceRecognitionPreprocessor."); + } + for (size_t i = 0; i < outputs.size(); ++i) { + 
outputs[i].StopSharing(); + } + return outputs; + }) + .def( + "disable_normalize", + &vision::faceid::InsightFaceRecognitionPreprocessor::DisableNormalize) + .def("disable_permute", + &vision::faceid::InsightFaceRecognitionPreprocessor::DisablePermute) + .def_property( + "alpha", + &vision::faceid::InsightFaceRecognitionPreprocessor::GetAlpha, + &vision::faceid::InsightFaceRecognitionPreprocessor::SetAlpha) + .def_property( + "beta", &vision::faceid::InsightFaceRecognitionPreprocessor::GetBeta, + &vision::faceid::InsightFaceRecognitionPreprocessor::SetBeta) + .def_property( + "size", &vision::faceid::InsightFaceRecognitionPreprocessor::GetSize, + &vision::faceid::InsightFaceRecognitionPreprocessor::SetSize); pybind11::class_( m, "InsightFaceRecognitionPostprocessor") .def(pybind11::init()) - .def("run", [](vision::faceid::InsightFaceRecognitionPostprocessor& self, std::vector& inputs) { - std::vector results; - if (!self.Run(inputs, &results)) { - throw std::runtime_error("Failed to postprocess the runtime result in InsightFaceRecognitionPostprocessor."); - } - return results; - }) - .def("run", [](vision::faceid::InsightFaceRecognitionPostprocessor& self, std::vector& input_array) { - std::vector results; - std::vector inputs; - PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true); - if (!self.Run(inputs, &results)) { - throw std::runtime_error("Failed to postprocess the runtime result in InsightFaceRecognitionPostprocessor."); - } - return results; - }) - .def_property("l2_normalize", &vision::faceid::InsightFaceRecognitionPostprocessor::GetL2Normalize, - &vision::faceid::InsightFaceRecognitionPostprocessor::SetL2Normalize); + .def("run", + [](vision::faceid::InsightFaceRecognitionPostprocessor& self, + std::vector& inputs) { + std::vector results; + if (!self.Run(inputs, &results)) { + throw std::runtime_error( + "Failed to postprocess the runtime result in " + "InsightFaceRecognitionPostprocessor."); + } + return results; + }) + .def("run", + [](vision::faceid::InsightFaceRecognitionPostprocessor& self, + std::vector& input_array) { + std::vector results; + std::vector inputs; + PyArrayToTensorList(input_array, &inputs, /*share_buffer=*/true); + if (!self.Run(inputs, &results)) { + throw std::runtime_error( + "Failed to postprocess the runtime result in " + "InsightFaceRecognitionPostprocessor."); + } + return results; + }) + .def_property( + "l2_normalize", + &vision::faceid::InsightFaceRecognitionPostprocessor::GetL2Normalize, + &vision::faceid::InsightFaceRecognitionPostprocessor::SetL2Normalize); pybind11::class_( m, "InsightFaceRecognitionBase") - .def(pybind11::init()) - .def("predict", [](vision::faceid::InsightFaceRecognitionBase& self, pybind11::array& data) { - cv::Mat im = PyArrayToCvMat(data); - vision::FaceRecognitionResult result; - self.Predict(im, &result); - return result; - }) - .def("batch_predict", [](vision::faceid::InsightFaceRecognitionBase& self, std::vector& data) { - std::vector images; - for (size_t i = 0; i < data.size(); ++i) { - images.push_back(PyArrayToCvMat(data[i])); - } - std::vector results; - self.BatchPredict(images, &results); - return results; - }) - .def_property_readonly("preprocessor", &vision::faceid::InsightFaceRecognitionBase::GetPreprocessor) - .def_property_readonly("postprocessor", &vision::faceid::InsightFaceRecognitionBase::GetPostprocessor); + .def(pybind11::init()) + .def("predict", + [](vision::faceid::InsightFaceRecognitionBase& self, + pybind11::array& data) { + cv::Mat im = PyArrayToCvMat(data); + 
vision::FaceRecognitionResult result; + self.Predict(im, &result); + return result; + }) + .def("batch_predict", + [](vision::faceid::InsightFaceRecognitionBase& self, + std::vector& data) { + std::vector images; + for (size_t i = 0; i < data.size(); ++i) { + images.push_back(PyArrayToCvMat(data[i])); + } + std::vector results; + self.BatchPredict(images, &results); + return results; + }) + .def_property_readonly( + "preprocessor", + &vision::faceid::InsightFaceRecognitionBase::GetPreprocessor) + .def_property_readonly( + "postprocessor", + &vision::faceid::InsightFaceRecognitionBase::GetPostprocessor); - pybind11::class_(m, "ArcFace") - .def(pybind11::init()); + pybind11::class_(m, "ArcFace") + .def(pybind11::init()); - pybind11::class_(m, "CosFace") - .def(pybind11::init()); + pybind11::class_(m, "CosFace") + .def(pybind11::init()); - pybind11::class_(m, "PartialFC") - .def(pybind11::init()); + pybind11::class_(m, "PartialFC") + .def(pybind11::init()); - pybind11::class_(m, "VPL") - .def(pybind11::init()); + pybind11::class_(m, "VPL") + .def(pybind11::init()); } } // namespace fastdeploy diff --git a/fastdeploy/vision/faceid/contrib/insightface/model.h b/fastdeploy/vision/faceid/contrib/insightface/model.h index a1a8f128bd6..8ae5c950a9e 100755 --- a/fastdeploy/vision/faceid/contrib/insightface/model.h +++ b/fastdeploy/vision/faceid/contrib/insightface/model.h @@ -35,6 +35,8 @@ class FASTDEPLOY_DECL ArcFace : public InsightFaceRecognitionBase { if (model_format == ModelFormat::ONNX) { valid_cpu_backends = {Backend::ORT}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; + } else if (model_format == ModelFormat::RKNN) { + valid_rknpu_backends = {Backend::RKNPU2}; } else { valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; @@ -63,6 +65,8 @@ class FASTDEPLOY_DECL CosFace : public InsightFaceRecognitionBase { if (model_format == ModelFormat::ONNX) { valid_cpu_backends = {Backend::ORT}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; + } else if (model_format == ModelFormat::RKNN) { + valid_rknpu_backends = {Backend::RKNPU2}; } else { valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; @@ -83,13 +87,15 @@ class FASTDEPLOY_DECL PartialFC : public InsightFaceRecognitionBase { * \param[in] model_format Model format of the loaded model, default is Paddle format */ PartialFC(const std::string& model_file, const std::string& params_file = "", - const RuntimeOption& custom_option = RuntimeOption(), - const ModelFormat& model_format = ModelFormat::ONNX) + const RuntimeOption& custom_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) : InsightFaceRecognitionBase(model_file, params_file, custom_option, model_format) { if (model_format == ModelFormat::ONNX) { valid_cpu_backends = {Backend::ORT}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; + } else if (model_format == ModelFormat::RKNN) { + valid_rknpu_backends = {Backend::RKNPU2}; } else { valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; @@ -110,13 +116,15 @@ class FASTDEPLOY_DECL VPL : public InsightFaceRecognitionBase { * \param[in] model_format Model format of the loaded model, default is Paddle format */ VPL(const std::string& model_file, const std::string& params_file = "", - const RuntimeOption& custom_option = RuntimeOption(), - const ModelFormat& 
model_format = ModelFormat::ONNX) + const RuntimeOption& custom_option = RuntimeOption(), + const ModelFormat& model_format = ModelFormat::ONNX) : InsightFaceRecognitionBase(model_file, params_file, custom_option, model_format) { if (model_format == ModelFormat::ONNX) { valid_cpu_backends = {Backend::ORT}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; + } else if (model_format == ModelFormat::RKNN) { + valid_rknpu_backends = {Backend::RKNPU2}; } else { valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; diff --git a/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc b/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc old mode 100755 new mode 100644 index c846522cc1f..398a7016e0b --- a/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc +++ b/fastdeploy/vision/faceid/contrib/insightface/preprocessor.cc @@ -23,11 +23,10 @@ InsightFaceRecognitionPreprocessor::InsightFaceRecognitionPreprocessor() { size_ = {112, 112}; alpha_ = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f}; beta_ = {-1.f, -1.f, -1.f}; // RGB - permute_ = true; } - -bool InsightFaceRecognitionPreprocessor::Preprocess(FDMat * mat, FDTensor* output) { +bool InsightFaceRecognitionPreprocessor::Preprocess(FDMat* mat, + FDTensor* output) { // face recognition model's preprocess steps in insightface // reference: insightface/recognition/arcface_torch/inference.py // 1. Resize @@ -39,13 +38,16 @@ bool InsightFaceRecognitionPreprocessor::Preprocess(FDMat * mat, FDTensor* outpu if (resize_h != mat->Height() || resize_w != mat->Width()) { Resize::Run(mat, resize_w, resize_h); } - if (permute_) { + + if (!disable_permute_) { BGR2RGB::Run(mat); } - Convert::Run(mat, alpha_, beta_); - HWC2CHW::Run(mat); - Cast::Run(mat, "float"); + if (!disable_normalize_) { + Convert::Run(mat, alpha_, beta_); + HWC2CHW::Run(mat); + Cast::Run(mat, "float"); + } mat->ShareWithTensor(output); output->ExpandDim(0); // reshape to n, h, w, c @@ -55,7 +57,8 @@ bool InsightFaceRecognitionPreprocessor::Preprocess(FDMat * mat, FDTensor* outpu bool InsightFaceRecognitionPreprocessor::Run(std::vector* images, std::vector* outputs) { if (images->empty()) { - FDERROR << "The size of input images should be greater than 0." << std::endl; + FDERROR << "The size of input images should be greater than 0." + << std::endl; return false; } FDASSERT(images->size() == 1, "Only support batch = 1 now."); diff --git a/fastdeploy/vision/faceid/contrib/insightface/preprocessor.h b/fastdeploy/vision/faceid/contrib/insightface/preprocessor.h index 88d0dce8dca..b73538df49e 100755 --- a/fastdeploy/vision/faceid/contrib/insightface/preprocessor.h +++ b/fastdeploy/vision/faceid/contrib/insightface/preprocessor.h @@ -54,10 +54,11 @@ class FASTDEPLOY_DECL InsightFaceRecognitionPreprocessor { /// Set beta. void SetBeta(std::vector& beta) { beta_ = beta; } - bool GetPermute() { return permute_; } + /// This function will disable normalize and hwc2chw in preprocessing step. + void DisableNormalize() { disable_normalize_ = true; } - /// Set permute. - void SetPermute(bool permute) { permute_ = permute; } + /// This function will disable hwc2chw in preprocessing step. 
+ void DisablePermute() { disable_permute_ = true; } protected: bool Preprocess(FDMat* mat, FDTensor* output); @@ -70,9 +71,11 @@ class FASTDEPLOY_DECL InsightFaceRecognitionPreprocessor { // Argument for image preprocessing step, beta values for normalization, // default beta = {-1.f, -1.f, -1.f} std::vector beta_; + // for recording the switch of normalize + bool disable_normalize_ = false; // Argument for image preprocessing step, whether to swap the B and R channel, // such as BGR->RGB, default true. - bool permute_; + bool disable_permute_ = false; }; } // namespace faceid diff --git a/fastdeploy/vision/utils/face_align.cc b/fastdeploy/vision/utils/face_align.cc new file mode 100644 index 00000000000..63dcc43972d --- /dev/null +++ b/fastdeploy/vision/utils/face_align.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// reference: +// https://github.com/deepinsight/insightface/blob/master/recognition/_tools_/cpp_align/face_align.h +#include "fastdeploy/vision/utils/utils.h" + +namespace fastdeploy { +namespace vision { +namespace utils { + +cv::Mat MeanAxis0(const cv::Mat& src) { + int num = src.rows; + int dim = src.cols; + cv::Mat output(1, dim, CV_32F); + for (int i = 0; i < dim; i++) { + float sum = 0; + for (int j = 0; j < num; j++) { + sum += src.at(j, i); + } + output.at(0, i) = sum / num; + } + return output; +} + +cv::Mat ElementwiseMinus(const cv::Mat& A, const cv::Mat& B) { + cv::Mat output(A.rows, A.cols, A.type()); + assert(B.cols == A.cols); + if (B.cols == A.cols) { + for (int i = 0; i < A.rows; i++) { + for (int j = 0; j < B.cols; j++) { + output.at(i, j) = A.at(i, j) - B.at(0, j); + } + } + } + return output; +} + +cv::Mat VarAxis0(const cv::Mat& src) { + cv::Mat temp_ = ElementwiseMinus(src, MeanAxis0(src)); + cv::multiply(temp_, temp_, temp_); + return MeanAxis0(temp_); +} + +int MatrixRank(cv::Mat M) { + cv::Mat w, u, vt; + cv::SVD::compute(M, w, u, vt); + cv::Mat1b non_zero_singular_values = w > 0.0001; + int rank = countNonZero(non_zero_singular_values); + return rank; +} + +cv::Mat SimilarTransform(cv::Mat& dst, cv::Mat& src) { + int num = dst.rows; + int dim = dst.cols; + cv::Mat src_mean = MeanAxis0(dst); + cv::Mat dst_mean = MeanAxis0(src); + cv::Mat src_demean = ElementwiseMinus(dst, src_mean); + cv::Mat dst_demean = ElementwiseMinus(src, dst_mean); + cv::Mat A = (dst_demean.t() * src_demean) / static_cast(num); + cv::Mat d(dim, 1, CV_32F); + d.setTo(1.0f); + if (cv::determinant(A) < 0) { + d.at(dim - 1, 0) = -1; + } + cv::Mat T = cv::Mat::eye(dim + 1, dim + 1, CV_32F); + cv::Mat U, S, V; + cv::SVD::compute(A, S, U, V); + int rank = MatrixRank(A); + if (rank == 0) { + assert(rank == 0); + } else if (rank == dim - 1) { + if (cv::determinant(U) * cv::determinant(V) > 0) { + T.rowRange(0, dim).colRange(0, dim) = U * V; + } else { + int s = d.at(dim - 1, 0) = -1; + d.at(dim - 1, 0) = -1; + + T.rowRange(0, dim).colRange(0, dim) = U * V; + cv::Mat diag_ 
= cv::Mat::diag(d); + cv::Mat twp = diag_ * V; // np.dot(np.diag(d), V.T) + cv::Mat B = cv::Mat::zeros(3, 3, CV_8UC1); + cv::Mat C = B.diag(0); + T.rowRange(0, dim).colRange(0, dim) = U * twp; + d.at(dim - 1, 0) = s; + } + } else { + cv::Mat diag_ = cv::Mat::diag(d); + cv::Mat twp = diag_ * V.t(); // np.dot(np.diag(d), V.T) + cv::Mat res = U * twp; // U + T.rowRange(0, dim).colRange(0, dim) = -U.t() * twp; + } + cv::Mat var_ = VarAxis0(src_demean); + float val = cv::sum(var_).val[0]; + cv::Mat res; + cv::multiply(d, S, res); + float scale = 1.0 / val * cv::sum(res).val[0]; + T.rowRange(0, dim).colRange(0, dim) = + -T.rowRange(0, dim).colRange(0, dim).t(); + cv::Mat temp1 = T.rowRange(0, dim).colRange(0, dim); // T[:dim, :dim] + cv::Mat temp2 = src_mean.t(); // src_mean.T + cv::Mat temp3 = temp1 * temp2; // np.dot(T[:dim, :dim], src_mean.T) + cv::Mat temp4 = scale * temp3; + T.rowRange(0, dim).colRange(dim, dim + 1) = -(temp4 - dst_mean.t()); + T.rowRange(0, dim).colRange(0, dim) *= scale; + return T; +} + +std::vector AlignFaceWithFivePoints( + cv::Mat& image, FaceDetectionResult& result, + std::vector> std_landmarks, + std::array output_size) { + FDASSERT(std_landmarks.size() == 5, "The landmarks.size() must be 5.") + FDASSERT(!image.empty(), "The input_image can't be empty.") + std::vector output_images(result.boxes.size()); + if (result.boxes.empty()) { + FDWARNING << "The result is empty." << std::endl; + return output_images; + } + + cv::Mat src(5, 2, CV_32FC1, std_landmarks.data()); + for (int i = 0; i < result.landmarks.size(); i += 5) { + cv::Mat dst(5, 2, CV_32FC1, result.landmarks.data() + i); + cv::Mat m = SimilarTransform(dst, src); + cv::Mat map_matrix; + cv::Rect map_matrix_r = cv::Rect(0, 0, 3, 2); + cv::Mat(m, map_matrix_r).copyTo(map_matrix); + cv::Mat cropped_image_aligned; + cv::warpAffine(image, cropped_image_aligned, map_matrix, + {output_size[0], output_size[1]}); + if (cropped_image_aligned.empty()) { + FDWARNING << "croppedImageAligned is empty." << std::endl; + } + output_images.push_back(cropped_image_aligned); + } + return output_images; +} +} // namespace utils +} // namespace vision +} // namespace fastdeploy diff --git a/fastdeploy/vision/utils/utils.h b/fastdeploy/vision/utils/utils.h index c36d8d0369a..9f5106c4adb 100644 --- a/fastdeploy/vision/utils/utils.h +++ b/fastdeploy/vision/utils/utils.h @@ -70,16 +70,32 @@ void SortDetectionResult(DetectionResult* output); void SortDetectionResult(FaceDetectionResult* result); // L2 Norm / cosine similarity (for face recognition, ...) -FASTDEPLOY_DECL std::vector L2Normalize( - const std::vector& values); +FASTDEPLOY_DECL std::vector +L2Normalize(const std::vector& values); FASTDEPLOY_DECL float CosineSimilarity(const std::vector& a, const std::vector& b, bool normalized = true); -bool CropImageByBox(Mat& src_im, Mat* dst_im, - const std::vector& box, std::vector* center, - std::vector* scale, const float expandratio = 0.3); +/** \brief Do face align for model with five points. 
+ * + * \param[in] image The original image + * \param[in] result FaceDetectionResult + * \param[in] std_landmarks Standard face template + * \param[in] output_size The size of output mat + */ +FASTDEPLOY_DECL std::vector AlignFaceWithFivePoints( + cv::Mat& image, FaceDetectionResult& result, + std::vector> std_landmarks = {{38.2946f, 51.6963f}, + {73.5318f, 51.5014f}, + {56.0252f, 71.7366f}, + {41.5493f, 92.3655f}, + {70.7299f, 92.2041f}}, + std::array output_size = {112, 112}); + +bool CropImageByBox(Mat& src_im, Mat* dst_im, const std::vector& box, + std::vector* center, std::vector* scale, + const float expandratio = 0.3); /** * Function: for keypoint detection model, fine positioning of keypoints in diff --git a/python/fastdeploy/vision/faceid/contrib/insightface/__init__.py b/python/fastdeploy/vision/faceid/contrib/insightface/__init__.py index 3353c8e4408..dd8cab5e20d 100644 --- a/python/fastdeploy/vision/faceid/contrib/insightface/__init__.py +++ b/python/fastdeploy/vision/faceid/contrib/insightface/__init__.py @@ -56,13 +56,17 @@ def beta(self): """ return self._preprocessor.beta - @property - def permute(self): + def disable_normalize(self): + """ + This function will disable normalize in preprocessing step. + """ + self._preprocessor.disable_normalize() + + def disable_permute(self): """ - Argument for image preprocessing step, whether to swap the B and R channel, - such as BGR->RGB, default true. + This function will disable hwc2chw in preprocessing step. """ - return self._preprocessor.permute + self._preprocessor.disable_permute() class InsightFaceRecognitionPostprocessor: diff --git a/tools/rknpu2/config/arcface_quantized.yaml b/tools/rknpu2/config/arcface_quantized.yaml new file mode 100644 index 00000000000..95642b5c9c4 --- /dev/null +++ b/tools/rknpu2/config/arcface_quantized.yaml @@ -0,0 +1,15 @@ +mean: + - + - 127.5 + - 127.5 + - 127.5 +std: + - + - 127.5 + - 127.5 + - 127.5 +model_path: ./ms1mv3_arcface_r18/ms1mv3_arcface_r18.onnx +outputs_nodes: +do_quantization: True +dataset: "./ms1mv3_arcface_r18/datasets.txt" +output_folder: "./ms1mv3_arcface_r18" diff --git a/tools/rknpu2/config/arcface_unquantized.yaml b/tools/rknpu2/config/arcface_unquantized.yaml new file mode 100644 index 00000000000..c11b285d362 --- /dev/null +++ b/tools/rknpu2/config/arcface_unquantized.yaml @@ -0,0 +1,15 @@ +mean: + - + - 127.5 + - 127.5 + - 127.5 +std: + - + - 127.5 + - 127.5 + - 127.5 +model_path: ./ms1mv3_arcface_r18/ms1mv3_arcface_r18.onnx +outputs_nodes: +do_quantization: False +dataset: "./ms1mv3_arcface_r18/datasets.txt" +output_folder: "./ms1mv3_arcface_r18"
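
The two yaml files above are intended for the ArcFace ONNX-to-RKNN conversion (with and without quantization) via the tooling under `tools/rknpu2`. As a rough illustration of what those fields configure, the sketch below maps them onto rknn-toolkit2 calls; the actual conversion is performed by the scripts in `tools/rknpu2`, and the target platform shown is an assumption, since the yaml does not specify one.

```python
# Illustration only: how the arcface_quantized.yaml fields map onto
# rknn-toolkit2 calls. The target platform here is an assumption.
from rknn.api import RKNN

rknn = RKNN()
rknn.config(
    mean_values=[[127.5, 127.5, 127.5]],   # yaml: mean
    std_values=[[127.5, 127.5, 127.5]],    # yaml: std
    target_platform="rk3588")              # assumed, not part of the yaml
rknn.load_onnx(model="./ms1mv3_arcface_r18/ms1mv3_arcface_r18.onnx")  # yaml: model_path
rknn.build(do_quantization=True,           # yaml: do_quantization
           dataset="./ms1mv3_arcface_r18/datasets.txt")               # yaml: dataset
rknn.export_rknn("./ms1mv3_arcface_r18/ms1mv3_arcface_r18.rknn")      # written under yaml: output_folder
rknn.release()
```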