From 7cfe6e6dd50073023dac115788164d4733a0eeeb Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Sat, 2 Apr 2022 22:35:11 +0800 Subject: [PATCH 01/51] fix pose demo and windows build (#307) --- csrc/codebase/mmpose/CMakeLists.txt | 3 ++- csrc/codebase/mmpose/mmpose.h | 2 ++ demo/csrc/pose_detection.cpp | 3 +-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/csrc/codebase/mmpose/CMakeLists.txt b/csrc/codebase/mmpose/CMakeLists.txt index 6d4c7dd562..ae58ce91ba 100644 --- a/csrc/codebase/mmpose/CMakeLists.txt +++ b/csrc/codebase/mmpose/CMakeLists.txt @@ -7,5 +7,6 @@ include(${CMAKE_SOURCE_DIR}/cmake/MMDeploy.cmake) file(GLOB_RECURSE SRCS ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp") mmdeploy_add_module(${PROJECT_NAME} "${SRCS}") -target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy_opencv_utils) +target_link_libraries(${PROJECT_NAME} PRIVATE + mmdeploy::transform mmdeploy_opencv_utils) add_library(mmdeploy::mmpose ALIAS ${PROJECT_NAME}) diff --git a/csrc/codebase/mmpose/mmpose.h b/csrc/codebase/mmpose/mmpose.h index ed66f53a8e..a658d48947 100644 --- a/csrc/codebase/mmpose/mmpose.h +++ b/csrc/codebase/mmpose/mmpose.h @@ -3,6 +3,8 @@ #ifndef MMDEPLOY_MMPOSE_H #define MMDEPLOY_MMPOSE_H +#include + #include "codebase/common.h" #include "core/device.h" #include "core/module.h" diff --git a/demo/csrc/pose_detection.cpp b/demo/csrc/pose_detection.cpp index 14fa9c7391..253e965a8a 100644 --- a/demo/csrc/pose_detection.cpp +++ b/demo/csrc/pose_detection.cpp @@ -31,8 +31,7 @@ int main(int argc, char *argv[]) { mm_mat_t mat{img.data, img.rows, img.cols, 3, MM_BGR, MM_INT8}; mm_pose_detect_t *res{}; - int *res_count{}; - status = mmdeploy_pose_detector_apply(pose_estimator, &mat, 1, &res, &res_count); + status = mmdeploy_pose_detector_apply(pose_estimator, &mat, 1, &res); if (status != MM_SUCCESS) { fprintf(stderr, "failed to apply pose estimator, code: %d\n", (int)status); return 1; From 442e9cd7de85bc4868a0b48225c07895f7c07352 Mon Sep 17 00:00:00 2001 From: Shengxi Li 
<982783556@qq.com> Date: Wed, 6 Apr 2022 19:45:15 +0800 Subject: [PATCH 02/51] add postprocessing_masks gpu version (#276) * add postprocessing_masks gpu version * default device cpu * pre-commit fix Co-authored-by: hadoop-basecv --- .../mmdet/deploy/object_detection_model.py | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/mmdeploy/codebase/mmdet/deploy/object_detection_model.py b/mmdeploy/codebase/mmdet/deploy/object_detection_model.py index b368d10972..51f2b3cc80 100644 --- a/mmdeploy/codebase/mmdet/deploy/object_detection_model.py +++ b/mmdeploy/codebase/mmdet/deploy/object_detection_model.py @@ -60,6 +60,7 @@ def __init__(self, backend: Backend, backend_files: Sequence[str], super().__init__(deploy_cfg=deploy_cfg) self.CLASSES = class_names self.deploy_cfg = deploy_cfg + self.device = device self._init_wrapper( backend=backend, backend_files=backend_files, device=device) @@ -114,6 +115,7 @@ def postprocessing_masks(det_bboxes: np.ndarray, det_masks: np.ndarray, img_w: int, img_h: int, + device: str = 'cpu', mask_thr_binary: float = 0.5) -> np.ndarray: """Additional processing of masks. Resizes masks from [num_det, 28, 28] to [num_det, img_w, img_h]. Analog of the 'mmdeploy.codebase.mmdet. 
@@ -138,8 +140,8 @@ def postprocessing_masks(det_bboxes: np.ndarray, return np.zeros((0, img_h, img_w)) if isinstance(masks, np.ndarray): - masks = torch.tensor(masks) - bboxes = torch.tensor(bboxes) + masks = torch.tensor(masks, device=torch.device(device)) + bboxes = torch.tensor(bboxes, device=torch.device(device)) result_masks = [] for bbox, mask in zip(bboxes, masks): @@ -147,8 +149,16 @@ def postprocessing_masks(det_bboxes: np.ndarray, x0_int, y0_int = 0, 0 x1_int, y1_int = img_w, img_h - img_y = torch.arange(y0_int, y1_int, dtype=torch.float32) + 0.5 - img_x = torch.arange(x0_int, x1_int, dtype=torch.float32) + 0.5 + img_y = torch.arange( + y0_int, + y1_int, + dtype=torch.float32, + device=torch.device(device)) + 0.5 + img_x = torch.arange( + x0_int, + x1_int, + dtype=torch.float32, + device=torch.device(device)) + 0.5 x0, y0, x1, y1 = bbox img_y = (img_y - y0) / (y1 - y0) * 2 - 1 @@ -169,10 +179,8 @@ def postprocessing_masks(det_bboxes: np.ndarray, grid[None, :, :, :], align_corners=False) - mask = img_masks - mask = (mask >= mask_thr_binary).to(dtype=torch.bool) - result_masks.append(mask.numpy()) - result_masks = np.concatenate(result_masks, axis=1) + result_masks.append(img_masks) + result_masks = torch.cat(result_masks, 1) return result_masks.squeeze(0) def forward(self, img: Sequence[torch.Tensor], img_metas: Sequence[dict], @@ -206,6 +214,8 @@ def forward(self, img: Sequence[torch.Tensor], img_metas: Sequence[dict], if isinstance(scale_factor, (list, tuple, np.ndarray)): assert len(scale_factor) == 4 scale_factor = np.array(scale_factor)[None, :] # [1,4] + scale_factor = torch.from_numpy(scale_factor).to( + device=torch.device(self.device)) dets[:, :4] /= scale_factor if 'border' in img_metas[i]: @@ -216,7 +226,7 @@ def forward(self, img: Sequence[torch.Tensor], img_metas: Sequence[dict], y_off = img_metas[i]['border'][0] dets[:, [0, 2]] -= x_off dets[:, [1, 3]] -= y_off - dets[:, :4] *= (dets[:, :4] > 0).astype(dets.dtype) + dets[:, :4] *= (dets[:, 
:4] > 0) dets_results = bbox2result(dets, labels, len(self.CLASSES)) @@ -234,16 +244,14 @@ def forward(self, img: Sequence[torch.Tensor], img_metas: Sequence[dict], 'export_postprocess_mask', True) if not export_postprocess_mask: masks = End2EndModel.postprocessing_masks( - dets[:, :4], masks, ori_w, ori_h) + dets[:, :4], masks, ori_w, ori_h, self.device) else: masks = masks[:, :img_h, :img_w] # avoid to resize masks with zero dim if rescale and masks.shape[0] != 0: - masks = masks.astype(np.float32) - masks = torch.from_numpy(masks) masks = torch.nn.functional.interpolate( masks.unsqueeze(0), size=(ori_h, ori_w)) - masks = masks.squeeze(0).detach().numpy() + masks = masks.squeeze(0) if masks.dtype != bool: masks = masks >= 0.5 segms_results = [[] for _ in range(len(self.CLASSES))] @@ -267,7 +275,6 @@ def forward_test(self, imgs: torch.Tensor, *args, **kwargs) -> \ """ outputs = self.wrapper({self.input_name: imgs}) outputs = self.wrapper.output_to_list(outputs) - outputs = [out.detach().cpu().numpy() for out in outputs] return outputs def show_result(self, From 85c46eefd6c1b419e2395b01788cb421419d04ef Mon Sep 17 00:00:00 2001 From: lzhangzz Date: Thu, 7 Apr 2022 11:11:28 +0800 Subject: [PATCH 03/51] fixed a bug causes text-recognizer to fail when (non-NULL) empty bboxes list is passed (#310) --- csrc/apis/c/text_recognizer.cpp | 59 +++++++++++++++++++++------------ demo/csrc/ocr.cpp | 2 +- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/csrc/apis/c/text_recognizer.cpp b/csrc/apis/c/text_recognizer.cpp index 9458712b54..975a34b2e5 100644 --- a/csrc/apis/c/text_recognizer.cpp +++ b/csrc/apis/c/text_recognizer.cpp @@ -108,15 +108,21 @@ int mmdeploy_text_recognizer_apply_bbox(mm_handle_t handle, const mm_mat_t *imag try { auto recognizer = static_cast(handle); - Value input{Value::kArray, Value::kArray}; + Value::Array input_images; + Value::Array input_bboxes; auto _bboxes = bboxes; auto result_count = 0; - for (int i = 0; i < image_count; ++i) { - 
mmdeploy::Mat _mat{images[i].height, images[i].width, PixelFormat(images[i].format), - DataType(images->type), images[i].data, Device{"cpu"}}; - input[0].push_back({{"ori_img", _mat}}); + // mapping from image index to result index, -1 represents invalid image with no bboxes + // supplied. + std::vector result_index(image_count, -1); + + for (int i = 0; i < image_count; ++i) { if (bboxes && bbox_count) { + if (bbox_count[i] == 0) { + // skip images with no bounding boxes (push nothing) + continue; + } Value boxes(Value::kArray); for (int j = 0; j < bbox_count[i]; ++j) { Value box; @@ -128,17 +134,26 @@ int mmdeploy_text_recognizer_apply_bbox(mm_handle_t handle, const mm_mat_t *imag } _bboxes += bbox_count[i]; result_count += bbox_count[i]; - input[1].push_back({{"boxes", boxes}}); + input_bboxes.push_back({{"boxes", boxes}}); } else { - input[1].push_back(Value::kNull); + // bboxes or bbox_count not supplied, use whole image result_count += 1; + input_bboxes.push_back(Value::kNull); } + + result_index[i] = static_cast(input_images.size()); + mmdeploy::Mat _mat{images[i].height, images[i].width, PixelFormat(images[i].format), + DataType(images->type), images[i].data, Device{"cpu"}}; + input_images.push_back({{"ori_img", _mat}}); } - auto output = recognizer->Run(std::move(input)).value().front(); + std::vector> recognizer_outputs; - auto recognizer_outputs = - from_value>>(output); + if (!input_images.empty()) { + Value input{std::move(input_images), std::move(input_bboxes)}; + auto output = recognizer->Run(std::move(input)).value().front(); + from_value(output, recognizer_outputs); + } std::vector counts; if (bboxes && bbox_count) { @@ -157,21 +172,23 @@ int mmdeploy_text_recognizer_apply_bbox(mm_handle_t handle, const mm_mat_t *imag new mm_text_recognize_t[result_count]{}, deleter); for (int i = 0; i < image_count; ++i) { - auto &recog_output = recognizer_outputs[i]; - for (int j = 0; j < recog_output.size(); ++j) { - auto &res = _results[offsets[i] + j]; + if 
(result_index[i] >= 0) { + auto &recog_output = recognizer_outputs[result_index[i]]; + for (int j = 0; j < recog_output.size(); ++j) { + auto &res = _results[offsets[i] + j]; - auto &box_result = recog_output[j]; + auto &box_result = recog_output[j]; - auto &score = box_result.score; - res.length = static_cast(score.size()); + auto &score = box_result.score; + res.length = static_cast(score.size()); - res.score = new float[score.size()]; - std::copy_n(score.data(), score.size(), res.score); + res.score = new float[score.size()]; + std::copy_n(score.data(), score.size(), res.score); - auto text = box_result.text; - res.text = new char[text.length() + 1]; - std::copy_n(text.data(), text.length() + 1, res.text); + auto text = box_result.text; + res.text = new char[text.length() + 1]; + std::copy_n(text.data(), text.length() + 1, res.text); + } } } *results = _results.release(); diff --git a/demo/csrc/ocr.cpp b/demo/csrc/ocr.cpp index 1bb8d43ef2..e2d5c10fde 100644 --- a/demo/csrc/ocr.cpp +++ b/demo/csrc/ocr.cpp @@ -30,7 +30,7 @@ int main(int argc, char *argv[]) { } mm_handle_t text_recognizer{}; - status = mmdeploy_text_recognizer_create_by_path(reg_model_path, "cpu", 0, &text_recognizer); + status = mmdeploy_text_recognizer_create_by_path(reg_model_path, device_name, 0, &text_recognizer); if (status != MM_SUCCESS) { fprintf(stderr, "failed to create text_recognizer, code: %d\n", (int)status); return 1; From 6e7e219b0b232d45aff58a6c8a23ef4fc047ec12 Mon Sep 17 00:00:00 2001 From: lzhangzz Date: Mon, 11 Apr 2022 20:45:10 +0800 Subject: [PATCH 04/51] [Fix] include missing for formatter.h (#313) * fix formatter * relax GCC version requirement --- csrc/core/utils/formatter.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/core/utils/formatter.h b/csrc/core/utils/formatter.h index af28f8c9c0..b1c2280909 100644 --- a/csrc/core/utils/formatter.h +++ b/csrc/core/utils/formatter.h @@ -7,6 +7,8 @@ #if FMT_VERSION >= 50000 #include 
"spdlog/fmt/bundled/ranges.h" +#else +#include #endif namespace mmdeploy { @@ -36,7 +38,7 @@ inline void format_arg(BasicFormatter &f, const char *, const mmdeploy::Va f.writer() << mmdeploy::format_value(d); } -template >, bool> = true> +template >::value, bool> = true> void format_arg(BasicFormatter &f, const char *, const T &v) { f.writer() << (int)v; } From d7adf815a0e65303b78b226548da8ccf30f7d103 Mon Sep 17 00:00:00 2001 From: Yifan Zhou Date: Thu, 14 Apr 2022 20:25:31 +0800 Subject: [PATCH 05/51] [Fix] MMEditing cannot save results when testing (#336) * fix show * lint * remove redundant codes * resolve comment * type hint --- .../mmedit/deploy/super_resolution_model.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/mmdeploy/codebase/mmedit/deploy/super_resolution_model.py b/mmdeploy/codebase/mmedit/deploy/super_resolution_model.py index ade5d0beea..454de8b951 100644 --- a/mmdeploy/codebase/mmedit/deploy/super_resolution_model.py +++ b/mmdeploy/codebase/mmedit/deploy/super_resolution_model.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Optional, Sequence, Union +import os.path as osp +from typing import Dict, List, Optional, Sequence, Union import mmcv import numpy as np @@ -88,6 +89,8 @@ def forward(self, def forward_test(self, lq: torch.Tensor, gt: Optional[torch.Tensor] = None, + meta: List[Dict] = None, + save_path=None, *args, **kwargs): """Run inference for restorer to generate evaluation result. @@ -96,6 +99,8 @@ def forward_test(self, lq (torch.Tensor): The input low-quality image of the model. gt (torch.Tensor): The ground truth of input image. Defaults to `None`. + meta (List[Dict]): The meta infomations of MMEditing. + save_path (str): Path to save image. Default: None. *args: Other arguments. **kwargs: Other key-pair arguments. 
@@ -104,6 +109,17 @@ def forward_test(self, """ outputs = self.forward_dummy(lq) result = self.test_post_process(outputs, lq, gt) + + # Align to mmediting BasicRestorer + if save_path: + outputs = [torch.from_numpy(i) for i in outputs] + + lq_path = meta[0]['lq_path'] + folder_name = osp.splitext(osp.basename(lq_path))[0] + save_path = osp.join(save_path, f'{folder_name}.png') + + mmcv.imwrite(tensor2img(outputs), save_path) + return result def forward_dummy(self, lq: torch.Tensor, *args, **kwargs): From 89ce8e20a13aaeb18c9ab05fad2caea747ea0430 Mon Sep 17 00:00:00 2001 From: tpoisonooo Date: Thu, 14 Apr 2022 22:13:26 +0800 Subject: [PATCH 06/51] docs(build): fix typo (#352) * docs(build): add missing build option * docs(build): add onnx install * style(doc): trim whitespace * docs(build): revert install onnx * docs(build): add ncnn LD_LIBRARY_PATH * docs(build): fix path error --- docs/en/build/linux.md | 1 + docs/en/faq.md | 12 +++++++++++- mmdeploy/core/rewriters/rewriter_utils.py | 4 ++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/docs/en/build/linux.md b/docs/en/build/linux.md index 72b2cfab7d..318444cf3e 100644 --- a/docs/en/build/linux.md +++ b/docs/en/build/linux.md @@ -235,6 +235,7 @@ Make sure to enable -DNCNN_PYTHON=ON in your build command.

 cd ncnn
 export NCNN_DIR=$(pwd)
+export LD_LIBRARY_PATH=${NCNN_DIR}/build/install/lib/:$LD_LIBRARY_PATH
 
3. Install pyncnn

diff --git a/docs/en/faq.md b/docs/en/faq.md
index b39042b437..14d621ea79 100644
--- a/docs/en/faq.md
+++ b/docs/en/faq.md
@@ -43,7 +43,17 @@
   `python fixNvPe.py --input=C:\Users\user\AppData\Local\Programs\Python\Python38\lib\site-packages\torch\lib\*.dll`
 
    You can find your pytorch installation path with:
-   ``` python
+   ```python
    import torch
    print(torch.__file__)
    ```
+
+### Pip
+- pip installed packages but could not `import` them.
+
+  Make sure you are using conda pip.
+  ```bash
+  $ which pip
+  # /path/to/.local/bin/pip
+  /path/to/miniconda3/lib/python3.9/site-packages/pip
+  ```
diff --git a/mmdeploy/core/rewriters/rewriter_utils.py b/mmdeploy/core/rewriters/rewriter_utils.py
index a80fd84738..5d8fd8a830 100644
--- a/mmdeploy/core/rewriters/rewriter_utils.py
+++ b/mmdeploy/core/rewriters/rewriter_utils.py
@@ -181,7 +181,7 @@ def check(self, env: Dict) -> bool:
 
 
 class RewriterRegistry:
-    """A registry that recoreds rewrite objects.
+    """A registry that records rewrite objects.
 
     Logically this class is a two-dimensional table which maintains an object
     list for each backend. The records can be inserted to this table through
@@ -304,7 +304,7 @@ def register_object(self,
             name (str): The import path to access the function/module.
             backend (str): The rewriter will be activated on which backend.
             ir (IR): The rewriter will be activated on which ir.
-            extra_chekcers (None | Checker | List[Checker]): Other requirements
+            extra_checkers (None | Checker | List[Checker]): Other requirements
                 for the rewriters. Default to `None`.
 
         Returns:

From 6aacedef282b459bb3984cb8d94d962f10f84706 Mon Sep 17 00:00:00 2001
From: Chen Xin 
Date: Fri, 15 Apr 2022 10:42:15 +0800
Subject: [PATCH 07/51] fix openvino export tmp model, add binary flag (#353)

---
 csrc/net/openvino/openvino_net.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/csrc/net/openvino/openvino_net.cpp b/csrc/net/openvino/openvino_net.cpp
index 1176967c9a..050658f970 100644
--- a/csrc/net/openvino/openvino_net.cpp
+++ b/csrc/net/openvino/openvino_net.cpp
@@ -86,10 +86,10 @@ Result OpenVINONet::Init(const Value& args) {
   OUTCOME_TRY(auto raw_bin, model.ReadFile(config.weights));
 
   try {
-    std::ofstream xml_out(tmp_xml);
+    std::ofstream xml_out(tmp_xml, std::ios::binary);
     xml_out << raw_xml;
     xml_out.close();
-    std::ofstream bin_out(tmp_bin);
+    std::ofstream bin_out(tmp_bin, std::ios::binary);
     bin_out << raw_bin;
     bin_out.close();
   } catch (const std::exception& e) {

From ade8e02047f10db7766b1660c566530b0e2a1237 Mon Sep 17 00:00:00 2001
From: lvhan028 
Date: Fri, 15 Apr 2022 11:10:38 +0800
Subject: [PATCH 08/51] init circleci (#348)

---
 .circleci/config.yml | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 .circleci/config.yml

diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000000..9ec7703397
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,39 @@
+# Use the latest 2.1 version of CircleCI pipeline process engine.
+# See: https://circleci.com/docs/2.0/configuration-reference
+version: 2.1
+
+# Define a job to be invoked later in a workflow.
+# See: https://circleci.com/docs/2.0/configuration-reference/#jobs
+jobs:
+  lint:
+    # Specify the execution environment. You can specify an image from Dockerhub or use one of our Convenience Images from CircleCI's Developer Hub.
+    # See: https://circleci.com/docs/2.0/configuration-reference/#docker-machine-macos-windows-executor
+    docker:
+      - image: cimg/python:3.7.4
+    # Add steps to the job
+    # See: https://circleci.com/docs/2.0/configuration-reference/#steps
+    steps:
+      - checkout
+      - run:
+          name: Install pre-commit hook
+          command: |
+            sudo apt-add-repository ppa:brightbox/ruby-ng -y
+            sudo apt-get update
+            sudo apt-get install -y ruby2.7
+            pip install pre-commit
+            pre-commit install
+      - run:
+          name: Linting
+          command: pre-commit run --all-files
+      - run:
+          name: Check docstring coverage
+          command: |
+            pip install interrogate
+            interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 80 mmdeploy
+
+# Invoke jobs via workflows
+# See: https://circleci.com/docs/2.0/configuration-reference/#workflows
+workflows:
+  pr_stage_test:
+    jobs:
+      - lint

From fdbd3d1a52c32b59e5593ad61d5bda265d6f9eae Mon Sep 17 00:00:00 2001
From: Chen Xin 
Date: Fri, 15 Apr 2022 12:30:55 +0800
Subject: [PATCH 09/51] fix wrong input mat type (#362)

* fix wrong input mat type

* fix lint
---
 csrc/apis/c/classifier.cpp      | 4 ++--
 csrc/apis/c/detector.cpp        | 4 ++--
 csrc/apis/c/pose_detector.cpp   | 4 ++--
 csrc/apis/c/segmentor.cpp       | 4 ++--
 csrc/apis/c/text_recognizer.cpp | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/csrc/apis/c/classifier.cpp b/csrc/apis/c/classifier.cpp
index ecdfaafc87..10b8708df1 100644
--- a/csrc/apis/c/classifier.cpp
+++ b/csrc/apis/c/classifier.cpp
@@ -86,8 +86,8 @@ int mmdeploy_classifier_apply(mm_handle_t handle, const mm_mat_t* mats, int mat_
 
     Value input{Value::kArray};
     for (int i = 0; i < mat_count; ++i) {
-      mmdeploy::Mat _mat{mats[i].height,       mats[i].width, PixelFormat(mats[i].format),
-                         DataType(mats->type), mats[i].data,  Device{"cpu"}};
+      mmdeploy::Mat _mat{mats[i].height,         mats[i].width, PixelFormat(mats[i].format),
+                         DataType(mats[i].type), mats[i].data,  Device{"cpu"}};
       input.front().push_back({{"ori_img", _mat}});
     }
 
diff --git a/csrc/apis/c/detector.cpp b/csrc/apis/c/detector.cpp
index 4dbb573f96..375c6e07a2 100644
--- a/csrc/apis/c/detector.cpp
+++ b/csrc/apis/c/detector.cpp
@@ -85,8 +85,8 @@ int mmdeploy_detector_apply(mm_handle_t handle, const mm_mat_t* mats, int mat_co
 
     Value input{Value::kArray};
     for (int i = 0; i < mat_count; ++i) {
-      mmdeploy::Mat _mat{mats[i].height,       mats[i].width, PixelFormat(mats[i].format),
-                         DataType(mats->type), mats[i].data,  Device{"cpu"}};
+      mmdeploy::Mat _mat{mats[i].height,         mats[i].width, PixelFormat(mats[i].format),
+                         DataType(mats[i].type), mats[i].data,  Device{"cpu"}};
       input.front().push_back({{"ori_img", _mat}});
     }
 
diff --git a/csrc/apis/c/pose_detector.cpp b/csrc/apis/c/pose_detector.cpp
index 6c5ef426ef..acc148ee1b 100644
--- a/csrc/apis/c/pose_detector.cpp
+++ b/csrc/apis/c/pose_detector.cpp
@@ -104,8 +104,8 @@ int mmdeploy_pose_detector_apply_bbox(mm_handle_t handle, const mm_mat_t* mats,
     Value input{Value::kArray};
     auto result_count = 0;
     for (int i = 0; i < mat_count; ++i) {
-      mmdeploy::Mat _mat{mats[i].height,       mats[i].width, PixelFormat(mats[i].format),
-                         DataType(mats->type), mats[i].data,  Device{"cpu"}};
+      mmdeploy::Mat _mat{mats[i].height,         mats[i].width, PixelFormat(mats[i].format),
+                         DataType(mats[i].type), mats[i].data,  Device{"cpu"}};
 
       Value img_with_boxes;
       if (bboxes && bbox_count) {
diff --git a/csrc/apis/c/segmentor.cpp b/csrc/apis/c/segmentor.cpp
index bcdca722a7..1f6ba9750c 100644
--- a/csrc/apis/c/segmentor.cpp
+++ b/csrc/apis/c/segmentor.cpp
@@ -84,8 +84,8 @@ int mmdeploy_segmentor_apply(mm_handle_t handle, const mm_mat_t* mats, int mat_c
 
     Value input{Value::kArray};
     for (int i = 0; i < mat_count; ++i) {
-      mmdeploy::Mat _mat{mats[i].height,       mats[i].width, PixelFormat(mats[i].format),
-                         DataType(mats->type), mats[i].data,  Device{"cpu"}};
+      mmdeploy::Mat _mat{mats[i].height,         mats[i].width, PixelFormat(mats[i].format),
+                         DataType(mats[i].type), mats[i].data,  Device{"cpu"}};
       input.front().push_back({{"ori_img", _mat}});
     }
 
diff --git a/csrc/apis/c/text_recognizer.cpp b/csrc/apis/c/text_recognizer.cpp
index 975a34b2e5..441a7c9423 100644
--- a/csrc/apis/c/text_recognizer.cpp
+++ b/csrc/apis/c/text_recognizer.cpp
@@ -142,8 +142,8 @@ int mmdeploy_text_recognizer_apply_bbox(mm_handle_t handle, const mm_mat_t *imag
       }
 
       result_index[i] = static_cast(input_images.size());
-      mmdeploy::Mat _mat{images[i].height,       images[i].width, PixelFormat(images[i].format),
-                         DataType(images->type), images[i].data,  Device{"cpu"}};
+      mmdeploy::Mat _mat{images[i].height,         images[i].width, PixelFormat(images[i].format),
+                         DataType(images[i].type), images[i].data,  Device{"cpu"}};
       input_images.push_back({{"ori_img", _mat}});
     }
 

From 88062e90a4dd3f4090310d857d8fc0c4f0efc84b Mon Sep 17 00:00:00 2001
From: tpoisonooo 
Date: Fri, 15 Apr 2022 14:47:49 +0800
Subject: [PATCH 10/51] fix(docs): remove redundant doc tree (#360)

---
 docs/en/index.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/en/index.rst b/docs/en/index.rst
index 717011adb0..e659b80678 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -24,7 +24,6 @@ You can switch between Chinese and English documents in the lower-left corner of
    tutorials/how_to_add_test_units_for_backend_ops.md
    tutorials/how_to_test_rewritten_models.md
    tutorials/how_to_use_docker.md
-   tutorials/how_to_write_config.md
    tutorials/how_to_install_mmdeploy_on_jetsons.md
 
 .. toctree::

From b9c5487d7a57a3d8b4ab7547f3007817258020d5 Mon Sep 17 00:00:00 2001
From: Chen Xin 
Date: Fri, 15 Apr 2022 15:34:15 +0800
Subject: [PATCH 11/51] fix missing ncnn_DIR & InferenceEngine_DIR (#364)

---
 docker/CPU/Dockerfile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docker/CPU/Dockerfile b/docker/CPU/Dockerfile
index fe9df4feeb..8ebaab9e4e 100644
--- a/docker/CPU/Dockerfile
+++ b/docker/CPU/Dockerfile
@@ -103,6 +103,8 @@ RUN cd mmdeploy && rm -rf build/CM* && mkdir -p build && cd build && cmake .. \
     -DMMDEPLOY_CODEBASES=all &&\
     cmake --build . -- -j$(nproc) && cmake --install . &&\
     cd install/example  && mkdir -p build && cd build &&\
-    cmake -DMMDeploy_DIR=/root/workspace/mmdeploy/build/install/lib/cmake/MMDeploy .. &&\
+    cmake .. -DMMDeploy_DIR=/root/workspace/mmdeploy/build/install/lib/cmake/MMDeploy \
+    -DInferenceEngine_DIR=/opt/intel/openvino/deployment_tools/inference_engine/share \
+    -Dncnn_DIR=/root/workspace/ncnn/build/install/lib/cmake/ncnn &&\
     cmake --build . && export SPDLOG_LEVEL=warn &&\
     if [ -z ${VERSION} ] ; then echo "Built MMDeploy master for CPU devices successfully!" ; else echo "Built MMDeploy version v${VERSION} for CPU devices successfully!" ; fi

From a8c75deec0813a088331e100b05f54de1829d171 Mon Sep 17 00:00:00 2001
From: HinGwenWoong 
Date: Mon, 18 Apr 2022 10:25:26 +0800
Subject: [PATCH 12/51] Fix mmdet openvino dynamic 300x300 cfg base (#372)

---
 configs/mmdet/detection/detection_openvino_dynamic-300x300.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/mmdet/detection/detection_openvino_dynamic-300x300.py b/configs/mmdet/detection/detection_openvino_dynamic-300x300.py
index 1df7d12114..ee879c361e 100644
--- a/configs/mmdet/detection/detection_openvino_dynamic-300x300.py
+++ b/configs/mmdet/detection/detection_openvino_dynamic-300x300.py
@@ -1 +1 @@
-_base_ = ['../_base_/base_openvino_dynamic.py']
+_base_ = ['../_base_/base_openvino_dynamic-300x300.py']

From 957fd589028636d0b0dfb1d2a9ef9a23487e6740 Mon Sep 17 00:00:00 2001
From: Junjie <61398820+Adenialzz@users.noreply.github.com>
Date: Mon, 18 Apr 2022 15:38:34 +0800
Subject: [PATCH 13/51] Fix: add onnxruntime building option in gpu dockerfile
 (#366)

---
 docker/GPU/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/GPU/Dockerfile b/docker/GPU/Dockerfile
index 8449d7b69e..9d08e144c9 100644
--- a/docker/GPU/Dockerfile
+++ b/docker/GPU/Dockerfile
@@ -82,9 +82,10 @@ RUN cd /root/workspace/mmdeploy &&\
         -DCMAKE_CXX_COMPILER=g++ \
         -Dpplcv_DIR=/root/workspace/ppl.cv/cuda-build/install/lib/cmake/ppl \
         -DTENSORRT_DIR=${TENSORRT_DIR} \
+        -DONNXRUNTIME_DIR=${ONNXRUNTIME_DIR} \
         -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \
         -DMMDEPLOY_TARGET_DEVICES="cuda;cpu" \
-        -DMMDEPLOY_TARGET_BACKENDS="trt" \
+        -DMMDEPLOY_TARGET_BACKENDS="ort;trt" \
         -DMMDEPLOY_CODEBASES=all &&\
     make -j$(nproc) && make install &&\
     cd install/example  && mkdir -p build && cd build &&\

From 9dad97e1b714c094505aa0bdfc22f3ed376483da Mon Sep 17 00:00:00 2001
From: Yifan Zhou 
Date: Sun, 24 Apr 2022 11:15:40 +0800
Subject: [PATCH 14/51] Tutorial 03: torch2onnx (#365)

* upload doc

* add images

* resolve comments

* update translation
---
 .../tutorials/chapter_03_pytorch2onnx.md      | 294 ++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md

diff --git a/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md b/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md
new file mode 100644
index 0000000000..e7f8d2ed99
--- /dev/null
+++ b/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md
@@ -0,0 +1,294 @@
+ONNX 是目前模型部署中最重要的中间表示之一。学懂了 ONNX 的技术细节,就能规避大量的模型部署问题。从这篇文章开始,在接下来的三篇文章里,我们将由浅入深地介绍 ONNX 相关的知识。在第一篇文章里,我们会介绍更多 PyTorch 转 ONNX 的细节,让大家完全掌握把简单的 PyTorch 模型转成 ONNX 模型的方法;在第二篇文章里,我们将介绍如何在 PyTorch 中支持更多的 ONNX 算子,让大家能彻底走通 PyTorch 到 ONNX 这条部署路线;第三篇文章里,我们将介绍 ONNX 本身的知识,以及修改、调试 ONNX 模型的常用方法,使大家能自行解决大部分和 ONNX 有关的部署问题。
+
+在把 PyTorch 模型转换成 ONNX 模型时,我们往往只需要轻松地调用一句`torch.onnx.export`就行了。这个函数的接口看上去简单,但它在使用上还有着诸多的“潜规则”。在这篇教程中,我们会详细介绍 PyTorch 模型转 ONNX 模型的原理及注意事项。除此之外,我们还会介绍 PyTorch 与 ONNX 的算子对应关系,以教会大家如何处理 PyTorch 模型转换时可能会遇到的算子支持问题。
+
+## `torch.onnx.export` 细解
+在这一节里,我们将详细介绍 PyTorch 到 ONNX 的转换函数—— torch.onnx.export。我们希望大家能够更加灵活地使用这个模型转换接口,并通过了解它的实现原理来更好地应对该函数的报错(由于模型部署的兼容性问题,部署复杂模型时该函数时常会报错)。
+### 计算图导出方法
+[TorchScript](https://pytorch.org/docs/stable/jit.html) 是一种序列化和优化 PyTorch 模型的格式,在优化过程中,一个`torch.nn.Module`模型会被转换成 TorchScript 的`torch.jit.ScriptModule`模型。现在, TorchScript 也被常当成一种中间表示使用。我们在[其他文章](https://zhuanlan.zhihu.com/p/486914187)中对 TorchScript 有详细的介绍,这里介绍 TorchScript 仅用于说明 PyTorch 模型转 ONNX的原理。
+`torch.onnx.export`中需要的模型实际上是一个`torch.jit.ScriptModule`。而要把普通 PyTorch 模型转一个这样的 TorchScript 模型,有跟踪(trace)和脚本化(script)两种导出计算图的方法。如果给`torch.onnx.export`传入了一个普通 PyTorch 模型(`torch.nn.Module`),那么这个模型会默认使用跟踪的方法导出。这一过程如下图所示:
+
+![image](https://user-images.githubusercontent.com/47652064/163531613-9eb3c851-933e-4b0d-913a-bf92ac36e80b.png)
+
+回忆一下我们[第一篇教程](./chapter_01_introduction_to_model_deployment.md)的知识:跟踪法只能通过实际运行一遍模型的方法导出模型的静态图,即无法识别出模型中的控制流(如循环);脚本化则能通过解析模型来正确记录所有的控制流。我们以下面这段代码为例来看一看这两种转换方法的区别:
+
+```python
+import torch
+
+class Model(torch.nn.Module):
+    def __init__(self, n):
+        super().__init__()
+        self.n = n
+        self.conv = torch.nn.Conv2d(3, 3, 3)
+
+    def forward(self, x):
+        for i in range(self.n):
+            x = self.conv(x)
+        return x
+
+
+models = [Model(2), Model(3)]
+model_names = ['model_2', 'model_3']
+
+for model, model_name in zip(models, model_names):
+    dummy_input = torch.rand(1, 3, 10, 10)
+    dummy_output = model(dummy_input)
+    model_trace = torch.jit.trace(model, dummy_input)
+    model_script = torch.jit.script(model)
+
+    # 跟踪法与直接 torch.onnx.export(model, ...)等价
+    torch.onnx.export(model_trace, dummy_input, f'{model_name}_trace.onnx', example_outputs=dummy_output)
+    # 脚本化必须先调用 torch.jit.script
+    torch.onnx.export(model_script, dummy_input, f'{model_name}_script.onnx', example_outputs=dummy_output)
+```
+
+在这段代码里,我们定义了一个带循环的模型,模型通过参数`n`来控制输入张量被卷积的次数。之后,我们各创建了一个`n=2`和`n=3`的模型。我们把这两个模型分别用跟踪和脚本化的方法进行导出。
+值得一提的是,由于这里的两个模型(`model_trace`, `model_script`)是 TorchScript 模型,`export`函数已经不需要再运行一遍模型了。(如果模型是用跟踪法得到的,那么在执行`torch.jit.trace`的时候就运行过一遍了;而用脚本化导出时,模型不需要实际运行)参数中的`dummy_input`和`dummy_output`仅仅是为了获取输入和输出张量的类型和形状。
+运行上面的代码,我们把得到的4个 onnx 文件用 Netron 可视化:
+
+![image](https://user-images.githubusercontent.com/47652064/163531637-994ffa0a-847d-4c0d-a9e3-0ecd78c9a3aa.png)
+
+首先看跟踪法得到的 ONNX 模型结构。可以看出来,对于不同的 `n`,ONNX 模型的结构是不一样的。
+
+![image](https://user-images.githubusercontent.com/47652064/163531659-b06e5df2-6e18-462e-82ff-b16d95b9765c.png)
+
+而用脚本化的话,最终的 ONNX 模型用 `Loop` 节点来表示循环。这样哪怕对于不同的 `n`,ONNX 模型也有同样的结构。
+由于推理引擎对静态图的支持更好,通常我们在模型部署时不需要显式地把 PyTorch 模型转成 TorchScript 模型,直接把 PyTorch 模型用 `torch.onnx.export` 跟踪导出即可。了解这部分的知识主要是为了在模型转换报错时能够更好地定位问题是否发生在 PyTorch 转 TorchScript 阶段。
+### 参数讲解
+了解完转换函数的原理后,我们来详细介绍一下该函数的主要参数的作用。我们主要会从应用的角度来介绍每个参数在不同的模型部署场景中应该如何设置,而不会去列出每个参数的所有设置方法。该函数详细的 API 文档可参考 [torch.onnx ‒ PyTorch 1.11.0 documentation](https://pytorch.org/docs/stable/onnx.html#functions)
+
+`torch.onnx.export` 在 `torch.onnx.__init__.py`文件中的定义如下:
+```python
+def export(model, args, f, export_params=True, verbose=False, training=TrainingMode.EVAL,
+           input_names=None, output_names=None, aten=False, export_raw_ir=False,
+           operator_export_type=None, opset_version=None, _retain_param_name=True,
+           do_constant_folding=True, example_outputs=None, strip_doc_string=True,
+           dynamic_axes=None, keep_initializers_as_inputs=None, custom_opsets=None,
+           enable_onnx_checker=True, use_external_data_format=False):
+```
+前三个必选参数为模型、模型输入、导出的 onnx 文件名,我们对这几个参数已经很熟悉了。我们来着重看一下后面的一些常用可选参数。
+#### export_params
+模型中是否存储模型权重。一般中间表示包含两大类信息:模型结构和模型权重,这两类信息可以在同一个文件里存储,也可以分文件存储。ONNX 是用同一个文件表示记录模型的结构和权重的。
+我们部署时一般都默认这个参数为 True。如果 onnx 文件是用来在不同框架间传递模型(比如 PyTorch 到 Tensorflow)而不是用于部署,则可以令这个参数为 False。
+#### input_names, output_names
+设置输入和输出张量的名称。如果不设置的话,会自动分配一些简单的名字(如数字)。
+ONNX 模型的每个输入和输出张量都有一个名字。很多推理引擎在运行 ONNX 文件时,都需要以“名称-张量值”的数据对来输入数据,并根据输出张量的名称来获取输出数据。在进行跟张量有关的设置(比如添加动态维度)时,也需要知道张量的名字。
+在实际的部署流水线中,我们都需要设置输入和输出张量的名称,并保证 ONNX 和推理引擎中使用同一套名称。
+#### opset_version
+转换时参考哪个 ONNX 算子集版本,默认为9。后文会详细介绍 PyTorch 与 ONNX 的算子对应关系。
+#### dynamic_axes
+指定输入输出张量的哪些维度是动态的。
+为了追求效率,ONNX 默认所有参与运算的张量都是静态的(张量的形状不发生改变)。但在实际应用中,我们又希望模型的输入张量是动态的,尤其是本来就没有形状限制的全卷积模型。因此,我们需要显式地指明输入输出张量的哪几个维度的大小是可变的。
+我们来看一个`dynamic_axes`的设置例子:
+```python
+import torch
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 3, 3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        return x
+
+
+model = Model()
+dummy_input = torch.rand(1, 3, 10, 10)
+model_names = ['model_static.onnx',
+'model_dynamic_0.onnx',
+'model_dynamic_23.onnx']
+
+dynamic_axes_0 = {
+    'in' : [0],
+    'out' : [0]
+}
+dynamic_axes_23 = {
+    'in' : [2, 3],
+    'out' : [2, 3]
+}
+
+torch.onnx.export(model, dummy_input, model_names[0],
+input_names=['in'], output_names=['out'])
+torch.onnx.export(model, dummy_input, model_names[1],
+input_names=['in'], output_names=['out'], dynamic_axes=dynamic_axes_0)
+torch.onnx.export(model, dummy_input, model_names[2],
+input_names=['in'], output_names=['out'], dynamic_axes=dynamic_axes_23)
+```
+首先,我们导出3个 ONNX 模型,分别为没有动态维度、第0维动态、第2第3维动态的模型。
+在这份代码里,我们是用列表的方式表示动态维度,例如:
+```python
+dynamic_axes_0 = {
+    'in' : [0],
+    'out' : [0]
+}
+```
+由于 ONNX 要求每个动态维度都有一个名字,这样写的话会引出一条 UserWarning,警告我们通过列表的方式设置动态维度的话系统会自动为它们分配名字。一种显式添加动态维度名字的方法如下:
+```python
+dynamic_axes_0 = {
+    'in' : {0: 'batch'},
+    'out' : {0: 'batch'}
+}
+```
+
+由于在这份代码里我们没有更多的对动态维度的操作,因此简单地用列表指定动态维度即可。
+之后,我们用下面的代码来看一看动态维度的作用:
+```python
+import onnxruntime
+import numpy as np
+
+origin_tensor = np.random.rand(1, 3, 10, 10).astype(np.float32)
+mult_batch_tensor = np.random.rand(2, 3, 10, 10).astype(np.float32)
+big_tensor = np.random.rand(1, 3, 20, 20).astype(np.float32)
+
+inputs = [origin_tensor, mult_batch_tensor, big_tensor]
+exceptions = dict()
+
+for model_name in model_names:
+    for i, input in enumerate(inputs):
+        try:
+            ort_session = onnxruntime.InferenceSession(model_name)
+            ort_inputs = {'in': input}
+            ort_session.run(['out'], ort_inputs)
+        except Exception as e:
+            exceptions[(i, model_name)] = e
+            print(f'Input[{i}] on model {model_name} error.')
+        else:
+            print(f'Input[{i}] on model {model_name} succeed.')
+```
+我们在模型导出计算图时用的是一个形状为`(1, 3, 10, 10)`的张量。现在,我们来尝试以形状分别是`(1, 3, 10, 10), (2, 3, 10, 10), (1, 3, 20, 20)`为输入,用ONNX Runtime运行一下这几个模型,看看哪些情况下会报错,并保存对应的报错信息。得到的输出信息应该如下:
+```python
+Input[0] on model model_static.onnx succeed.
+Input[1] on model model_static.onnx error.
+Input[2] on model model_static.onnx error.
+Input[0] on model model_dynamic_0.onnx succeed.
+Input[1] on model model_dynamic_0.onnx succeed.
+Input[2] on model model_dynamic_0.onnx error.
+Input[0] on model model_dynamic_23.onnx succeed.
+Input[1] on model model_dynamic_23.onnx error.
+Input[2] on model model_dynamic_23.onnx succeed.
+```
+可以看出,形状相同的`(1, 3, 10, 10)`的输入在所有模型上都没有出错。而对于batch(第0维)或者长宽(第2、3维)不同的输入,只有在设置了对应的动态维度后才不会出错。我们可以从错误信息中找出是哪些维度出了问题。比如我们可以用以下代码查看`input[1]`在`model_static.onnx`中的报错信息:
+```python
+print(exceptions[(1, 'model_static.onnx')])
+
+# output
+# [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Got invalid dimensions for input: in for the following indices index: 0 Got: 2 Expected: 1 Please fix either the inputs or the model.
+```
+
+这段报错告诉我们名字叫`in`的输入的第0维不匹配。本来该维的长度应该为1,但我们的输入是2。实际部署中,如果我们碰到了类似的报错,就可以通过设置动态维度来解决问题。
+### 使用技巧
+通过学习之前的知识,我们基本掌握了 `torch.onnx.export` 函数的部分实现原理和参数设置方法,足以完成简单模型的转换了。但在实际应用中,使用该函数还会踩很多坑。这里我们模型部署团队把在实战中积累的一些经验分享给大家。
+#### 使模型在 ONNX 转换时有不同的行为
+有些时候,我们希望模型在直接用 PyTorch 推理时有一套逻辑,而在导出的ONNX模型中有另一套逻辑。比如,我们可以把一些后处理的逻辑放在模型里,以简化除运行模型之外的其他代码。`torch.onnx.is_in_onnx_export()`可以实现这一任务,该函数仅在执行 `torch.onnx.export()`时为真。以下是一个例子:
+```python
+import torch
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 3, 3)
+
+    def forward(self, x):
+        x = self.conv(x)
+        if torch.onnx.is_in_onnx_export():
+            x = torch.clip(x, 0, 1)
+        return x
+```
+
+这里,我们仅在模型导出时把输出张量的数值限制在[0, 1]之间。使用 `is_in_onnx_export` 确实能让我们方便地在代码中添加和模型部署相关的逻辑。但是,这些代码对只关心模型训练的开发者和用户来说很不友好,突兀的部署逻辑会降低代码整体的可读性。同时,`is_in_onnx_export` 只能在每个需要添加部署逻辑的地方都“打补丁”,难以进行统一的管理。我们之后会介绍如何使用 MMDeploy 的重写机制来规避这些问题。
+#### 利用中断张量跟踪的操作
+PyTorch 转 ONNX 的跟踪导出法不是万能的。如果我们在模型中做了一些很“出格”的操作,跟踪法会把某些取决于输入的中间结果变成常量,从而使导出的ONNX模型和原来的模型有出入。以下是一个会造成这种“跟踪中断”的例子:
+```python
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        x = x * x[0].item()
+        return x, torch.Tensor([i for i in x])
+
+model = Model()
+dummy_input = torch.rand(10)
+torch.onnx.export(model, dummy_input, 'a.onnx')
+```
+
+如果你尝试去导出这个模型,会得到一大堆 warning,告诉你转换出来的模型可能不正确。这也难怪,我们在这个模型里使用了`.item()`把 torch 中的张量转换成了普通的 Python 变量,还尝试遍历 torch 张量,并用一个列表新建一个 torch 张量。这些涉及张量与普通变量转换的逻辑都会导致最终的 ONNX 模型不太正确。
+另一方面,我们也可以利用这个性质,在保证正确性的前提下令模型的中间结果变成常量。这个技巧常常用于模型的静态化上,即令模型中所有的张量形状都变成常量。在未来的教程中,我们会在部署实例中详细介绍这些“高级”操作。
+#### 使用张量为输入(PyTorch版本 < 1.9.0)
+正如我们第一篇教程所展示的,在较旧(< 1.9.0)的 PyTorch 中把 Python 数值作为 `torch.onnx.export()`的模型输入时会报错。出于兼容性的考虑,我们还是推荐以张量为模型转换时的模型输入。
+## PyTorch 对 ONNX 的算子支持
+在确保`torch.onnx.export()`的调用方法无误后,PyTorch 转 ONNX 时最容易出现的问题就是算子不兼容了。这里我们会介绍如何判断某个 PyTorch 算子在 ONNX 中是否兼容,以助大家在碰到报错时能更好地把错误归类。而具体添加算子的方法我们会在之后的文章里介绍。
+在转换普通的`torch.nn.Module`模型时,PyTorch 一方面会用跟踪法执行前向推理,把遇到的算子整合成计算图;另一方面,PyTorch 还会把遇到的每个算子翻译成 ONNX 中定义的算子。在这个翻译过程中,可能会碰到以下情况:
+- 该算子可以一对一地翻译成一个 ONNX 算子。
+- 该算子在 ONNX 中没有直接对应的算子,会翻译成一至多个 ONNX 算子。
+- 该算子没有定义翻译成 ONNX 的规则,报错。
+
+那么,该如何查看 PyTorch 算子与 ONNX 算子的对应情况呢?由于 PyTorch 算子是向 ONNX 对齐的,这里我们先看一下 ONNX 算子的定义情况,再看一下 PyTorch 定义的算子映射关系。
+### ONNX 算子文档
+ONNX 算子的定义情况,都可以在官方的[算子文档](https://github.com/onnx/onnx/blob/main/docs/Operators.md)中查看。这份文档十分重要,我们碰到任何和 ONNX 算子有关的问题都得来“请教”这份文档。
+
+![image](https://user-images.githubusercontent.com/47652064/163531682-306991b9-1ffe-49fe-8aee-be27b618b096.png)
+
+这份文档中最重要的是开头的这个算子变更表格。表格的第一列是算子名,第二列是该算子发生变动的算子集版本号,也就是我们之前在`torch.onnx.export`中提到的`opset_version`表示的算子集版本号。通过查看算子第一次发生变动的版本号,我们可以知道某个算子是从哪个版本开始支持的;通过查看某算子小于等于`opset_version`的第一个改动记录,我们可以知道当前算子集版本中该算子的定义规则。
+
+![image](https://user-images.githubusercontent.com/47652064/163531690-2d70e6d2-728b-4f7f-8f5a-efaaf620ff02.png)
+
+通过点击表格中的链接,我们可以查看某个算子的输入、输出参数规定及使用示例。比如上图是Relu在 ONNX 中的定义规则,这份定义表明 Relu 应该有一个输入和一个输出,输入输出的类型相同,均为 tensor。
+### PyTorch 对 ONNX 算子的映射
+在 PyTorch 中,和 ONNX 有关的定义全部放在 [torch.onnx 目录](https://github.com/pytorch/pytorch/tree/master/torch/onnx)中,如下图所示:
+
+![image](https://user-images.githubusercontent.com/47652064/163531700-ddf994e5-6989-483c-a1a3-f1b50dfd84f0.png)
+
+其中,`symbolic_opset{n}.py`(符号表文件)即表示 PyTorch 在支持第 n 版 ONNX 算子集时新加入的内容。我们之前讲过, bicubic 插值是在第 11 个版本开始支持的。我们以它为例来看看如何查找算子的映射情况。
+首先,使用搜索功能,在`torch/onnx`文件夹搜索"bicubic",可以发现这个插值在第 11 个版本的定义文件中:
+
+![image](https://user-images.githubusercontent.com/47652064/163531714-7cf9b784-5b7f-4438-ba01-8cff4c7c9ddc.png)
+
+之后,我们按照代码的调用逻辑,逐步跳转直到最底层的 ONNX 映射函数:
+```python
+upsample_bicubic2d = _interpolate("upsample_bicubic2d", 4, "cubic")
+
+->
+
+def _interpolate(name, dim, interpolate_mode):
+    return sym_help._interpolate_helper(name, dim, interpolate_mode)
+
+->
+
+def _interpolate_helper(name, dim, interpolate_mode):
+    def symbolic_fn(g, input, output_size, *args):
+        ...
+
+    return symbolic_fn
+```
+最后,在`symbolic_fn`中,我们可以看到插值算子是怎么样被映射成多个 ONNX 算子的。其中,每一个`g.op`就是一个 ONNX 算子的定义。比如其中的 `Resize` 算子就是这样写的:
+```python
+    return g.op("Resize",
+                input,
+                empty_roi,
+                empty_scales,
+                output_size,
+                coordinate_transformation_mode_s=coordinate_transformation_mode,
+                cubic_coeff_a_f=-0.75,  # only valid when mode="cubic"
+                mode_s=interpolate_mode,  # nearest, linear, or cubic
+                nearest_mode_s="floor")  # only valid when mode="nearest"
+```
+通过在前面提到的 ONNX 算子文档中查找 [Resize 算子的定义](https://github.com/onnx/onnx/blob/main/docs/Operators.md#resize),我们就可以知道这每一个参数的含义了。用类似的方法,我们可以去查询其他 ONNX 算子的参数含义,进而知道 PyTorch 中的参数是怎样一步一步传入到每个 ONNX 算子中的。
+掌握了如何查询 PyTorch 映射到 ONNX 的关系后,我们在实际应用时就可以在 `torch.onnx.export()`的`opset_version`中先预设一个版本号,碰到了问题就去对应的 PyTorch 符号表文件里去查。如果某算子确实不存在,或者算子的映射关系不满足我们的要求,我们就可能得用其他的算子绕过去,或者自定义算子了。
+## 总结
+在这篇教程中,我们系统地介绍了 PyTorch 转 ONNX 的原理。我们先是着重讲解了使用最频繁的 `torch.onnx.export`函数,又给出了查询 PyTorch 对 ONNX 算子支持情况的方法。通过本文,我们希望大家能够成功转换出大部分不需要添加新算子的 ONNX 模型,并在碰到算子问题时能够有效定位问题原因。具体而言,大家读完本文后应该了解以下的知识:
+- 跟踪法和脚本化在导出带控制语句的计算图时有什么区别。
+- `torch.onnx.export()`中该如何设置 `input_names, output_names, dynamic_axes`。
+- 使用 `torch.onnx.is_in_onnx_export()`来使模型在转换到 ONNX 时有不同的行为。
+- 如何查询 [ONNX 算子文档](https://github.com/onnx/onnx/blob/main/docs/Operators.md)。
+- 如何查询 PyTorch 对某个 ONNX 版本的新特性支持情况。
+- 如何判断 PyTorch 对某个 ONNX 算子是否支持,支持的方法是怎样的。
+
+这期介绍的知识比较抽象,大家会不会觉得有点“水”?没关系,下一期教程中,我们将以给出代码实例的形式,介绍多种为 PyTorch 转 ONNX 添加算子支持的方法,为大家在 PyTorch 转 ONNX 这条路上扫除更多的障碍。敬请期待哦!
+## 练习
+1. Asinh 算子出现于第 9 个 ONNX 算子集。PyTorch 在 9 号版本的符号表文件中是怎样支持这个算子的?
+2. BitShift 算子出现于第11个 ONNX 算子集。PyTorch 在 11 号版本的符号表文件中是怎样支持这个算子的?
+3. 在[第一篇教程](./chapter_01_introduction_to_model_deployment.md)中,我们讲过 PyTorch (截至第 11 号算子集)不支持在插值中设置动态的放缩系数。这个系数对应 `torch.onnx.symbolic_helper._interpolate_helper`的symbolic_fn的Resize算子映射关系中的哪个参数?我们是如何修改这一参数的?
+
+练习的答案会在下期教程中揭晓。

From 85f17789d182b0feeb81bce8e3a69804411430d6 Mon Sep 17 00:00:00 2001
From: hanrui1sensetime <83800577+hanrui1sensetime@users.noreply.github.com>
Date: Sun, 24 Apr 2022 11:18:33 +0800
Subject: [PATCH 15/51] [Docs] fix ncnn docs (#378)

* fix ncnn docs`

* update 0216
---
 docs/en/backends/ncnn.md    | 6 +++---
 docs/en/build/android.md    | 4 ++--
 docs/en/build/linux.md      | 2 +-
 docs/zh_cn/build/android.md | 4 ++--
 docs/zh_cn/build/linux.md   | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/en/backends/ncnn.md b/docs/en/backends/ncnn.md
index dacfadec33..18db4bf2f7 100644
--- a/docs/en/backends/ncnn.md
+++ b/docs/en/backends/ncnn.md
@@ -1,6 +1,6 @@
 ## ncnn Support
 
-MMDeploy now supports ncnn version == 1.0.20211208
+MMDeploy now supports ncnn version == 1.0.20220216
 
 ### Installation
 
@@ -27,7 +27,7 @@ You should ensure your gcc satisfies `gcc >= 6`.
 
     - Download ncnn source code
         ```bash
-        git clone -b 20211208 git@github.com:Tencent/ncnn.git
+        git clone -b 20220216 git@github.com:Tencent/ncnn.git
         ```
 
     - Make install ncnn library
@@ -82,7 +82,7 @@ If you haven't installed NCNN in the default path, please add `-Dncnn_DIR` flag
 
 #### Reminder
 
-- In ncnn version >= 1.0.20201208, the dimension of ncnn.Mat should be no more than 4.
+- In ncnn version >= 1.0.20220216, the dimension of ncnn.Mat should be no more than 4.
 
 ### FAQs
 
diff --git a/docs/en/build/android.md b/docs/en/build/android.md
index c5bbb3eb17..cfcc7b4bed 100644
--- a/docs/en/build/android.md
+++ b/docs/en/build/android.md
@@ -90,9 +90,9 @@ export OPENCV_ANDROID_SDK_DIR=${PWD}/OpenCV-android-sdk
   
     ncnn 
     A high-performance neural network inference computing framework supporting for android.
- Now, MMDeploy supports v20211208 and has to use git clone to download it.
+ Now, MMDeploy supports v20220216 and has to use git clone to download it.

-git clone -b 20211208 https://github.com/Tencent/ncnn.git
+git clone -b 20220216 https://github.com/Tencent/ncnn.git
 cd ncnn
 git submodule update --init
 export NCNN_DIR=${PWD}
diff --git a/docs/en/build/linux.md b/docs/en/build/linux.md
index 318444cf3e..095c2a364e 100644
--- a/docs/en/build/linux.md
+++ b/docs/en/build/linux.md
@@ -330,7 +330,7 @@ export MMDEPLOY_DIR=$(pwd)
     3. pplnn: PPL.NN. pplnn_DIR is needed.
 
-Dpplnn_DIR=${PPLNN_DIR}
4. ncnn: ncnn. ncnn_DIR is needed. -
-Dncnn_DIR=${NCNN_DIR}
+
-Dncnn_DIR=${NCNN_DIR}/build/install/lib/cmake/ncnn
5. openvino: OpenVINO. InferenceEngine_DIR is needed.
-DInferenceEngine_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/share
6. torchscript: TorchScript. Torch_DIR is needed. diff --git a/docs/zh_cn/build/android.md b/docs/zh_cn/build/android.md index 9bbe67ea3b..7c7a9f31fa 100644 --- a/docs/zh_cn/build/android.md +++ b/docs/zh_cn/build/android.md @@ -90,9 +90,9 @@ export OPENCV_ANDROID_SDK_DIR=${PWD}/OpenCV-android-sdk ncnn ncnn 是支持 android 平台的高效神经网络推理计算框架
- 目前, MMDeploy 支持 ncnn 的 20211208 版本, 且必须使用git clone 下载源码的方式安装
+ 目前, MMDeploy 支持 ncnn 的 20220216 版本, 且必须使用git clone 下载源码的方式安装

-git clone -b 20211208 https://github.com/Tencent/ncnn.git
+git clone -b 20220216 https://github.com/Tencent/ncnn.git
 cd ncnn
 git submodule update --init
 export NCNN_DIR=${PWD}
diff --git a/docs/zh_cn/build/linux.md b/docs/zh_cn/build/linux.md
index 4dad6d5a6a..627b0d75a3 100644
--- a/docs/zh_cn/build/linux.md
+++ b/docs/zh_cn/build/linux.md
@@ -320,7 +320,7 @@ export MMDEPLOY_DIR=$(pwd)
     3. pplnn: 表示 PPL.NN。需要设置 pplnn_DIR
 
-Dpplnn_DIR=${PPLNN_DIR}
4. ncnn: 表示 ncnn。需要设置 ncnn_DIR -
-Dncnn_DIR=${NCNN_DIR}
+
-Dncnn_DIR=${NCNN_DIR}/build/install/lib/cmake/ncnn
5. openvino: 表示 OpenVINO。需要设置 InferenceEngine_DIR
-DInferenceEngine_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/share
6. torchscript: TorchScript. 需要设置Torch_DIR From f9144f78db0a703b1f02dc87e48231856e2ce6d9 Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Mon, 25 Apr 2022 10:19:03 +0800 Subject: [PATCH 16/51] typo-fix (#397) --- csrc/apis/python/pose_detector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/apis/python/pose_detector.cpp b/csrc/apis/python/pose_detector.cpp index 36e024f1a1..da1af9af2d 100644 --- a/csrc/apis/python/pose_detector.cpp +++ b/csrc/apis/python/pose_detector.cpp @@ -14,7 +14,7 @@ class PyPoseDedector { auto status = mmdeploy_pose_detector_create_by_path(model_path, device_name, device_id, &handle_); if (status != MM_SUCCESS) { - throw std::runtime_error("failed to create pose_detedtor"); + throw std::runtime_error("failed to create pose_detector"); } } py::list Apply(const std::vector &imgs, const std::vector> &_boxes) { From ee265939baf54fa361f6d44fe87375241faa1dd3 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Mon, 25 Apr 2022 10:36:37 +0800 Subject: [PATCH 17/51] add CUDA_TOOKIT_ROOT_DIR as tensorrt detect dir (#357) * add CUDA_TOOKIT_ROOT_DIR as tensorrt detect dir * Update FindTENSORRT.cmake --- cmake/modules/FindTENSORRT.cmake | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cmake/modules/FindTENSORRT.cmake b/cmake/modules/FindTENSORRT.cmake index 0786413e79..e2c328923e 100644 --- a/cmake/modules/FindTENSORRT.cmake +++ b/cmake/modules/FindTENSORRT.cmake @@ -9,22 +9,24 @@ endif() find_path( TENSORRT_INCLUDE_DIR NvInfer.h - HINTS ${TENSORRT_DIR} + HINTS ${TENSORRT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES include) if (NOT TENSORRT_INCLUDE_DIR) - message(FATAL_ERROR "Cannot find TensorRT header NvInfer.h, " - "please check if the path is correct") + message(FATAL_ERROR "Cannot find TensorRT header NvInfer.h " + "in TENSORRT_DIR: ${TENSORRT_DIR} or in CUDA_TOOLKIT_ROOT_DIR: " + "${CUDA_TOOLKIT_ROOT_DIR}, please check if the path is correct.") endif () set(__TENSORRT_LIB_COMPONENTS 
nvinfer;nvinfer_plugin) foreach(__component ${__TENSORRT_LIB_COMPONENTS}) find_library( __component_path ${__component} - HINTS ${TENSORRT_DIR} + HINTS ${TENSORRT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64 lib/x64) if (NOT __component_path) - message(FATAL_ERROR "Cannot find TensorRT lib ${__component}, " + message(FATAL_ERROR "Cannot find TensorRT lib ${__component} in " + "TENSORRT_DIR: ${TENSORRT_DIR} or CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}, " "please check if the path is correct") endif() From f6fcee5f12085a70eb5f5fc0804a5cdaef97fd9c Mon Sep 17 00:00:00 2001 From: Song Lin <92794867+triple-Mu@users.noreply.github.com> Date: Mon, 25 Apr 2022 11:37:50 +0800 Subject: [PATCH 18/51] Fix docs (#398) --- docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md index 58f98d22dc..1b8deb1f2f 100644 --- a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md +++ b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md @@ -71,7 +71,7 @@ Then just clone and compile the project: ``` git clone git@github.com:pytorch/vision.git cd vision -git co tags/v0.7.0 -b vision07 +git checkout tags/v0.7.0 -b vision07 pip install -e . 
``` From 53ad86d6d47ebb667907d19aa3cb051893034dfe Mon Sep 17 00:00:00 2001 From: zly19540609 <31341706+zly19540609@users.noreply.github.com> Date: Tue, 26 Apr 2022 23:06:39 +0800 Subject: [PATCH 19/51] ort_net ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL (#383) --- csrc/net/ort/ort_net.cpp | 1 + csrc/net/trt/trt_net.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/csrc/net/ort/ort_net.cpp b/csrc/net/ort/ort_net.cpp index 10ab9f6e1f..04b597a092 100644 --- a/csrc/net/ort/ort_net.cpp +++ b/csrc/net/ort/ort_net.cpp @@ -23,6 +23,7 @@ static Result ConvertElementType(ONNXTensorElementDataType type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: return DataType::kHALF; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: return DataType::kINT8; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: return DataType::kINT32; diff --git a/csrc/net/trt/trt_net.cpp b/csrc/net/trt/trt_net.cpp index 9300aad10e..6caad3ed8f 100644 --- a/csrc/net/trt/trt_net.cpp +++ b/csrc/net/trt/trt_net.cpp @@ -88,6 +88,7 @@ static Result MapDataType(nvinfer1::DataType dtype) { case nvinfer1::DataType::kHALF: return DataType::kHALF; case nvinfer1::DataType::kINT8: + case nvinfer1::DataType::kBOOL: return DataType::kINT8; case nvinfer1::DataType::kINT32: return DataType::kINT32; From d9976c4b761b993b813c029901c5d5f3b81fec6d Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Tue, 26 Apr 2022 23:15:05 +0800 Subject: [PATCH 20/51] fix wrong buffer which will case onnxruntime-gpu crash with segmentaion (#363) * fix wrong buffer which will case onnxruntime-gpu crash with segmentaion * fix check * fix build error * remove unused header --- csrc/codebase/mmseg/segment.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/csrc/codebase/mmseg/segment.cpp b/csrc/codebase/mmseg/segment.cpp index 8d5aeef08e..9735fb4848 100644 --- a/csrc/codebase/mmseg/segment.cpp +++ b/csrc/codebase/mmseg/segment.cpp @@ -14,6 +14,7 @@ class ResizeMask : public MMSegmentation { 
explicit ResizeMask(const Value &cfg) : MMSegmentation(cfg) { try { classes_ = cfg["params"]["num_classes"].get(); + little_endian_ = IsLittleEndian(); } catch (const std::exception &e) { MMDEPLOY_ERROR("no ['params']['num_classes'] is specified in cfg: {}", cfg); throw_exception(eInvalidArgument); @@ -42,7 +43,7 @@ class ResizeMask : public MMSegmentation { // change kINT64 to 2 INT32 TensorDesc desc{ host_tensor.device(), DataType::kINT32, {1, 2, height, width}, host_tensor.name()}; - Tensor _host_tensor(desc, mask.buffer()); + Tensor _host_tensor(desc, host_tensor.buffer()); return MaskResize(_host_tensor, input_height, input_width); } else if (mask.data_type() == DataType::kINT32) { return MaskResize(host_tensor, input_height, input_width); @@ -68,15 +69,26 @@ class ResizeMask : public MMSegmentation { return to_value(output); } else { cv::Mat _dst; - cv::extractChannel(dst, _dst, 0); + int channel = little_endian_ ? 0 : dst.dims - 1; + cv::extractChannel(dst, _dst, channel); auto output_tensor = cpu::CVMat2Tensor(_dst); SegmentorOutput output{output_tensor, dst_height, dst_width, classes_}; return to_value(output); } } + bool IsLittleEndian() { + union Un { + char a; + int b; + } un; + un.b = 1; + return (int)un.a == 1; + } + protected: int classes_{}; + bool little_endian_; }; REGISTER_CODEBASE_COMPONENT(MMSegmentation, ResizeMask); From 95603486b09219bc60a2b6579fa947bfecdb5691 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Wed, 27 Apr 2022 10:30:03 +0800 Subject: [PATCH 21/51] fix benchmark (#411) --- docs/en/benchmark.md | 26 +++++++++++--------------- docs/zh_cn/benchmark.md | 25 ++++++++++--------------- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/docs/en/benchmark.md b/docs/en/benchmark.md index 9e66f19b85..e1609c6f67 100644 --- a/docs/en/benchmark.md +++ b/docs/en/benchmark.md @@ -1,12 +1,15 @@ ## Benchmark ### Backends + CPU: ncnn, ONNXRuntime, OpenVINO GPU: ncnn, TensorRT, PPLNN ### Latency benchmark + #### Platform + - Ubuntu 
18.04 - ncnn 20211208 - Cuda 11.3 @@ -15,6 +18,7 @@ GPU: ncnn, TensorRT, PPLNN - NVIDIA tesla T4 tensor core GPU for TensorRT. #### Other settings + - Static graph - Batch size 1 - Synchronize devices after each inference. @@ -22,12 +26,11 @@ GPU: ncnn, TensorRT, PPLNN - Warm up. For ncnn, we warm up 30 iters for all codebases. As for other backends: for classification, we warm up 1010 iters; for other codebases, we warm up 10 iters. - Input resolution varies for different datasets of different codebases. All inputs are real images except for `mmediting` because the dataset is not large enough. - Users can directly test the speed through [how_to_measure_performance_of_models.md](tutorials/how_to_measure_performance_of_models.md). And here is the benchmark in our environment. +
MMCls
- @@ -180,14 +183,12 @@ Users can directly test the speed through [how_to_measure_performance_of_models.
-
MMDet
- @@ -405,7 +406,6 @@ Users can directly test the speed through [how_to_measure_performance_of_models.
MMEdit
-
@@ -475,7 +475,6 @@ Users can directly test the speed through [how_to_measure_performance_of_models.
-
@@ -568,7 +567,6 @@ Users can directly test the speed through [how_to_measure_performance_of_models.
MMSeg
- @@ -673,7 +671,6 @@ Users can directly test the speed through [how_to_measure_performance_of_models.
-
@@ -684,7 +681,6 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](tut
MMCls
- @@ -781,7 +777,7 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](tut - + @@ -791,7 +787,7 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](tut - + @@ -804,7 +800,7 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](tut - + @@ -814,7 +810,7 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](tut - + @@ -837,7 +833,7 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](tut - + @@ -1819,8 +1815,8 @@ Users can directly test the performance through [how_to_evaluate_a_model.md](tut - ### Notes + - As some datasets contain images with various resolutions in codebase like MMDet. The speed benchmark is gained through static configs in MMDeploy, while the performance benchmark is gained through dynamic ones. - Some int8 performance benchmarks of TensorRT require Nvidia cards with tensor core, or the performance would drop heavily. diff --git a/docs/zh_cn/benchmark.md b/docs/zh_cn/benchmark.md index 3225c44fd8..2a96884ac7 100644 --- a/docs/zh_cn/benchmark.md +++ b/docs/zh_cn/benchmark.md @@ -1,6 +1,7 @@ ## 基准 ### 后端 + CPU: ncnn, ONNXRuntime, OpenVINO GPU: ncnn, TensorRT, PPLNN @@ -8,6 +9,7 @@ GPU: ncnn, TensorRT, PPLNN ### 延迟基准 #### 平台 + - Ubuntu 18.04 操作系统 - ncnn 20211208 - Cuda 11.3 @@ -16,6 +18,7 @@ GPU: ncnn, TensorRT, PPLNN - NVIDIA tesla T4 显卡. #### 其他设置 + - 静态图导出 - 批次大小为 1 - 每次推理后均同步 @@ -23,12 +26,11 @@ GPU: ncnn, TensorRT, PPLNN - 热身。 针对ncnn后端,我们热身30轮; 对于其他后端:针对分类任务,我们热身1010轮,对其他任务,我们热身10轮。 - 输入分辨率根据代码库的数据集不同而不同,除了`mmediting`,其他代码库均使用真实图片作为输入。 - 用户可以直接通过[如何测试延迟](tutorials/how_to_measure_performance_of_models.md)获得想要的速度测试结果。下面是我们环境中的测试结果: +
MMCls
-
93.84
ShuffleNetV1 1.0xShuffleNetV1 Classification top-1 68.1368.13 67.71 68.11$MMCLS_DIR/configs/shufflenet_v1/shufflenet_v1_1x_b64x16_linearlr_bn_nowd_imagenet.py$MMCLS_DIR/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py
top-587.80
ShuffleNetV2 1.0xShuffleNetV2 Classification top-1 69.5569.54 69.10 69.54$MMCLS_DIR/configs/shufflenet_v2/shufflenet_v2_1x_b64x16_linearlr_bn_nowd_imagenet.py$MMCLS_DIR/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py
top-571.87 70.91 71.84$MMEDIT_DIR/configs/restorers/real_esrgan/realesrnet_c64b23g32_12x4_lr2e-4_1000k_df2k_ost.py$MMEDIT_DIR/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py
top-5
@@ -181,14 +183,12 @@ GPU: ncnn, TensorRT, PPLNN
-
MMDet
- @@ -406,7 +406,6 @@ GPU: ncnn, TensorRT, PPLNN
MMEdit
-
@@ -476,7 +475,6 @@ GPU: ncnn, TensorRT, PPLNN
-
@@ -569,7 +567,6 @@ GPU: ncnn, TensorRT, PPLNN
MMSeg
- @@ -674,7 +671,6 @@ GPU: ncnn, TensorRT, PPLNN
-
@@ -686,7 +682,6 @@ GPU: ncnn, TensorRT, PPLNN
MMCls
- @@ -783,7 +778,7 @@ GPU: ncnn, TensorRT, PPLNN - + @@ -793,7 +788,7 @@ GPU: ncnn, TensorRT, PPLNN - + @@ -806,7 +801,7 @@ GPU: ncnn, TensorRT, PPLNN - + @@ -816,7 +811,7 @@ GPU: ncnn, TensorRT, PPLNN - + @@ -839,7 +834,7 @@ GPU: ncnn, TensorRT, PPLNN - + @@ -1807,8 +1802,8 @@ GPU: ncnn, TensorRT, PPLNN - ### 注意 + - 由于某些数据集在代码库中包含各种分辨率的图像,例如 MMDet,速度基准是通过 MMDeploy 中的静态配置获得的,而性能基准是通过动态配置获得的。 - TensorRT 的一些 int8 性能基准测试需要具有 tensor core 的 Nvidia 卡,否则性能会大幅下降。 From 8e6d4defc6946aa7778d556ea51ad05dafbbde31 Mon Sep 17 00:00:00 2001 From: HinGwenWoong Date: Wed, 27 Apr 2022 10:30:56 +0800 Subject: [PATCH 22/51] Add `sm_53` in cuda.cmake for Jetson Nano which will cashe when process sdk predict. (#407) --- cmake/cuda.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 9fe42596c4..deb4717973 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -34,6 +34,7 @@ enable_language(CUDA) set(_NVCC_FLAGS) if (NOT CMAKE_CUDA_ARCHITECTURES) set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") + set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_53,code=sm_53") if (CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "8") set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") From cecd1ecb477ffd7a7c002604e852c1da3678fc4a Mon Sep 17 00:00:00 2001 From: lzhangzz Date: Wed, 27 Apr 2022 15:15:57 +0800 Subject: [PATCH 23/51] [Fix] fix feature test for `std::source_location` (#416) * fix feature test for `std::source_location` * suppress msvc warnings * fix consistency --- CMakeLists.txt | 1 + csrc/core/utils/source_location.h | 36 +++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 004b94d609..5befeef828 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ endif () if (MSVC) add_compile_options($<$:/diagnostics:classic>) 
add_compile_options($<$:/Zc:preprocessor>) # /experimental:preprocessor on VS2017 + add_compile_options($<$:/Zc:__cplusplus>) add_compile_options($<$:/wd4251>) else () add_compile_options($<$:-fvisibility=hidden>) diff --git a/csrc/core/utils/source_location.h b/csrc/core/utils/source_location.h index f0d579b76b..e87a2ee3ba 100644 --- a/csrc/core/utils/source_location.h +++ b/csrc/core/utils/source_location.h @@ -3,17 +3,31 @@ #ifndef MMDEPLOY_SRC_UTILS_SOURCE_LOCATION_H_ #define MMDEPLOY_SRC_UTILS_SOURCE_LOCATION_H_ -#if __has_include() && !_MSC_VER -#include -namespace mmdeploy { -using SourceLocation = std::source_location; -} -#elif __has_include() -#include -namespace mmdeploy { -using SourceLocation = std::experimental::source_location; -} -#else +// clang-format off +#if __has_include() && (!_MSC_VER || __cplusplus >= 202002L) + #include + #if __cpp_lib_source_location >= 201907L + #define MMDEPLOY_HAS_SOURCE_LOCATION 1 + namespace mmdeploy { + using SourceLocation = std::source_location; + } + #endif +#endif + +#ifndef MMDEPLOY_HAS_SOURCE_LOCATION + #if __has_include() + #include + #if __cpp_lib_experimental_source_location >= 201505L + #define MMDEPLOY_HAS_SOURCE_LOCATION 1 + namespace mmdeploy { + using SourceLocation = std::experimental::source_location; + } + #endif + #endif +#endif +// clang-format on + +#ifndef MMDEPLOY_HAS_SOURCE_LOCATION #include namespace mmdeploy { class SourceLocation { From 72c19e9d5d9e5d8dcc14b76e53c4ce516cdb45e0 Mon Sep 17 00:00:00 2001 From: lzhangzz Date: Wed, 27 Apr 2022 15:19:19 +0800 Subject: [PATCH 24/51] fix format string (#417) --- csrc/core/model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/core/model.cpp b/csrc/core/model.cpp index d4b6361a91..58c756223e 100644 --- a/csrc/core/model.cpp +++ b/csrc/core/model.cpp @@ -56,7 +56,7 @@ Result Model::Init(const void* buffer, size_t size) { } OUTCOME_TRY(auto meta, impl->ReadMeta()); - MMDEPLOY_INFO("{} successfully load sdk model {}", 
entry.name); + MMDEPLOY_INFO("successfully load sdk model {}", entry.name); impl_ = std::move(impl); meta_ = std::move(meta); return success(); From a9a41443218955c208fd040b790fe2c5bc31b5d2 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Wed, 27 Apr 2022 16:06:50 +0800 Subject: [PATCH 25/51] [Fix] Fix seg name (#394) * fix seg name * use default name Co-authored-by: dongchunyu.vendor --- mmdeploy/codebase/mmseg/deploy/segmentation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mmdeploy/codebase/mmseg/deploy/segmentation.py b/mmdeploy/codebase/mmseg/deploy/segmentation.py index a3f1728ae2..6d6aecea6f 100644 --- a/mmdeploy/codebase/mmseg/deploy/segmentation.py +++ b/mmdeploy/codebase/mmseg/deploy/segmentation.py @@ -268,5 +268,10 @@ def get_model_name(self) -> str: """ assert 'decode_head' in self.model_cfg.model, 'model config contains' ' no decode_head' - name = self.model_cfg.model.decode_head.type[:-4].lower() + if isinstance(self.model_cfg.model.decode_head, list): + name = self.model_cfg.model.decode_head[-1].type[:-4].lower() + elif 'type' in self.model_cfg.model.decode_head: + name = self.model_cfg.model.decode_head.type[:-4].lower() + else: + name = 'mmseg_model' return name From 21230d5847729f20a4e389c8fb5b539429d550bf Mon Sep 17 00:00:00 2001 From: VVsssssk <88368822+VVsssssk@users.noreply.github.com> Date: Wed, 27 Apr 2022 20:04:56 +0800 Subject: [PATCH 26/51] =?UTF-8?q?=E3=80=90Docs=E3=80=91Add=20ipython=20not?= =?UTF-8?q?ebook=20tutorial=20(#234)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add ipynb file * rename file * add open in colab tag * fix lint and add img show * fix open in colab link * fix comments * fix pre-commit config --- .pre-commit-config.yaml | 2 +- demo/tutorials/tutorials_1.ipynb | 484 +++++++++++++++++++++++++++++++ 2 files changed, 485 insertions(+), 1 deletion(-) create mode 100755 
demo/tutorials/tutorials_1.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 432a9fc627..831953b3f6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,7 +39,7 @@ repos: rev: v2.1.0 hooks: - id: codespell - args: ["--skip=third_party/*,*.proto"] + args: ["--skip=third_party/*,*.ipynb,*.proto"] - repo: https://github.com/myint/docformatter rev: v1.4 diff --git a/demo/tutorials/tutorials_1.ipynb b/demo/tutorials/tutorials_1.ipynb new file mode 100755 index 0000000000..1ea0a5fafa --- /dev/null +++ b/demo/tutorials/tutorials_1.ipynb @@ -0,0 +1,484 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "mAWHDEbr6Q2i" + }, + "source": [ + "[![Open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/open-mmlab/mmdeploy/blob/master/demo/tutorials_1.ipynb)\n", + "# 前言\n", + "OpenMMLab 的算法如何部署?是很多社区用户的困惑。而模型部署工具箱 [MMDeploy](https://zhuanlan.zhihu.com/p/450342651) 的开源,强势打通了从算法模型到应用程序这 \"最后一公里\"!\n", + "今天我们将开启模型部署入门系列教程,在模型部署开源库 MMDeploy 的辅助下,介绍以下内容:\n", + "\n", + "\n", + "* 中间表示 ONNX 的定义标准\n", + "* PyTorch 模型转换到 ONNX 模型的方法\n", + "* 推理引擎 ONNX Runtime、TensorRT 的使用方法\n", + "* 部署流水线 PyTorch - ONNX - ONNX Runtime/TensorRT 的示例及常见部署问题的解决方法\n", + "* MMDeploy C/C++ 推理 SDK\n", + "希望通过本系列教程,带领大家学会如何把自己的 PyTorch 模型部署到 ONNX Runtime/TensorRT 上,并学会如何把 OpenMMLab 开源体系中各个计算机视觉任务的模型用 [MMDeploy](https://zhuanlan.zhihu.com/p/450342651) 部署到各个推理引擎上。\n", + "\n", + "**我们默认大家熟悉 Python 语言,并对 PyTorch 框架有基本的认识,除此之外不需要了解任何模型部署的知识。**\n", + "\n", + "在第一篇文章中,我们将部署一个简单的超分辨率模型,认识中间表示、推理引擎等模型部署中的概念。 \n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nJxQ-uXB1ULa" + }, + "source": [ + "# 初识模型部署\n", + "在软件工程中,部署指把开发完毕的软件投入使用的过程,包括环境配置、软件安装等步骤。类似地,对于深度学习模型来说,模型部署指让训练好的模型在特定环境中运行的过程。相比于软件部署,模型部署会面临更多的难题:\n", + "\n", + "1)运行模型所需的环境难以配置。深度学习模型通常是由一些框架编写,比如 PyTorch、TensorFlow。由于框架规模、依赖环境的限制,这些框架不适合在手机、开发板等生产环境中安装。\n", + "\n", + 
"2)深度学习模型的结构通常比较庞大,需要大量的算力才能满足实时运行的需求。模型的运行效率需要优化。\n", + "\n", + "因为这些难题的存在,模型部署不能靠简单的环境配置与安装完成。经过工业界和学术界数年的探索,模型部署有了一条流行的流水线:\n", + "\n", + "![pipeline](https://user-images.githubusercontent.com/4560679/156556619-3da7a572-876b-4909-b26f-04e81190c546.png)\n", + "\n", + "为了让模型最终能够部署到某一环境上,开发者们可以使用任意一种**深度学习框架**来定义网络结构,并通过训练确定网络中的参数。之后,模型的结构和参数会被转换成一种只描述网络结构的**中间表示**,一些针对网络结构的优化会在中间表示上进行。最后,用面向硬件的高性能编程框架(如 CUDA,OpenCL)编写,能高效执行深度学习网络中算子的推理引擎会把中间表示转换成特定的文件格式,并在对应硬件平台上高效运行模型。\n", + "\n", + "这一条流水线解决了模型部署中的两大问题:使用对接深度学习框架和**推理引擎**的中间表示,开发者不必担心如何在新环境中运行各个复杂的框架;通过中间表示的网络结构优化和推理引擎对运算的底层优化,模型的运算效率大幅提升。\n", + "\n", + "现在,让我们从一个模型部署的“Hello World”项目入手,见识一下模型部署各方面的知识吧!\n", + "\n", + "# 部署第一个模型\n", + "## 创建 PyTorch 模型\n", + "让我们用 PyTorch 实现一个超分辨率模型,并把模型部署到 ONNX Runtime 这个推理引擎上。\n", + "\n", + "首先,我们需要创建一个有 PyTorch 库的 Python 编程环境。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dAAZ7qdJ16Jo", + "outputId": "0461102c-f669-4f16-d97a-d8a98150666b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "nvcc: NVIDIA (R) Cuda compiler driver\n", + "Copyright (c) 2005-2020 NVIDIA Corporation\n", + "Built on Mon_Oct_12_20:09:46_PDT_2020\n", + "Cuda compilation tools, release 11.1, V11.1.105\n", + "Build cuda_11.1.TC455_06.29190527_0\n", + "gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0\n", + "Copyright (C) 2017 Free Software Foundation, Inc.\n", + "This is free software; see the source for copying conditions. 
There is NO\n", + "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n", + "\n" + ] + } + ], + "source": [ + "# 检查nvcc版本\n", + "!nvcc -V\n", + "# 检查gcc版本\n", + "!gcc --version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Y5knBfH63KFb", + "outputId": "a1a51caa-4222-4e0c-cda3-3adb52401e0a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in links: https://download.pytorch.org/whl/lts/1.8/torch_lts.html\n", + "Collecting torch==1.8.2+cu111\n", + " Downloading https://download.pytorch.org/whl/lts/1.8/cu111/torch-1.8.2%2Bcu111-cp37-cp37m-linux_x86_64.whl (1982.2 MB)\n", + "\u001b[K |█████████████▌ | 834.1 MB 1.6 MB/s eta 0:11:43tcmalloc: large alloc 1147494400 bytes == 0x5613cfd9a000 @ 0x7f1182a1e615 0x5613964f13bc 0x5613965d218a 0x5613964f41cd 0x5613965e6b3d 0x561396568458 0x56139656302f 0x5613964f5aba 0x5613965682c0 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x5613965e7986 0x561396564350 0x5613965e7986 0x561396564350 0x5613965e7986 0x561396564350 0x5613964f5f19 0x561396539a79 0x5613964f4b32 0x5613965681dd 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x56139656302f 0x5613964f5aba 0x561396563eae 0x5613964f59da 0x561396564108 0x56139656302f\n", + "\u001b[K |█████████████████ | 1055.7 MB 1.4 MB/s eta 0:11:10tcmalloc: large alloc 1434370048 bytes == 0x5614143f0000 @ 0x7f1182a1e615 0x5613964f13bc 0x5613965d218a 0x5613964f41cd 0x5613965e6b3d 0x561396568458 0x56139656302f 0x5613964f5aba 0x5613965682c0 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x5613965e7986 0x561396564350 0x5613965e7986 0x561396564350 0x5613965e7986 0x561396564350 0x5613964f5f19 0x561396539a79 0x5613964f4b32 0x5613965681dd 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x56139656302f 0x5613964f5aba 0x561396563eae 0x5613964f59da 0x561396564108 0x56139656302f\n", + "\u001b[K |█████████████████████▋ | 1336.2 MB 1.3 MB/s eta 
0:08:10tcmalloc: large alloc 1792966656 bytes == 0x561399222000 @ 0x7f1182a1e615 0x5613964f13bc 0x5613965d218a 0x5613964f41cd 0x5613965e6b3d 0x561396568458 0x56139656302f 0x5613964f5aba 0x5613965682c0 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x5613965e7986 0x561396564350 0x5613965e7986 0x561396564350 0x5613965e7986 0x561396564350 0x5613964f5f19 0x561396539a79 0x5613964f4b32 0x5613965681dd 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x56139656302f 0x5613964f5aba 0x561396563eae 0x5613964f59da 0x561396564108 0x56139656302f\n", + "\u001b[K |███████████████████████████▎ | 1691.1 MB 1.3 MB/s eta 0:03:43tcmalloc: large alloc 2241208320 bytes == 0x56140400a000 @ 0x7f1182a1e615 0x5613964f13bc 0x5613965d218a 0x5613964f41cd 0x5613965e6b3d 0x561396568458 0x56139656302f 0x5613964f5aba 0x5613965682c0 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x5613965e7986 0x561396564350 0x5613965e7986 0x561396564350 0x5613965e7986 0x561396564350 0x5613964f5f19 0x561396539a79 0x5613964f4b32 0x5613965681dd 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x56139656302f 0x5613964f5aba 0x561396563eae 0x5613964f59da 0x561396564108 0x56139656302f\n", + "\u001b[K |████████████████████████████████| 1982.2 MB 1.2 MB/s eta 0:00:01tcmalloc: large alloc 1982201856 bytes == 0x56148996c000 @ 0x7f1182a1d1e7 0x5613965275d7 0x5613964f13bc 0x5613965d218a 0x5613964f41cd 0x5613965e6b3d 0x561396568458 0x56139656302f 0x5613964f5aba 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564108 0x5613964f59da 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x56139656302f\n", + "tcmalloc: large alloc 2477752320 bytes == 0x561574036000 @ 0x7f1182a1e615 0x5613964f13bc 0x5613965d218a 0x5613964f41cd 0x5613965e6b3d 0x561396568458 0x56139656302f 0x5613964f5aba 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564108 0x56139656302f 
0x5613964f5aba 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564108 0x5613964f59da 0x561396564108 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x56139656302f 0x5613964f5aba 0x561396564cd4 0x56139656302f 0x5613964f6151\n", + "\u001b[K |████████████████████████████████| 1982.2 MB 6.6 kB/s \n", + "\u001b[?25hCollecting torchvision==0.9.2+cu111\n", + " Downloading https://download.pytorch.org/whl/lts/1.8/cu111/torchvision-0.9.2%2Bcu111-cp37-cp37m-linux_x86_64.whl (17.5 MB)\n", + "\u001b[K |████████████████████████████████| 17.5 MB 1.6 MB/s \n", + "\u001b[?25hCollecting torchaudio==0.8.2\n", + " Downloading https://download.pytorch.org/whl/lts/1.8/torchaudio-0.8.2-cp37-cp37m-linux_x86_64.whl (1.9 MB)\n", + "\u001b[K |████████████████████████████████| 1.9 MB 6.0 MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from torch==1.8.2+cu111) (1.21.5)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch==1.8.2+cu111) (3.10.0.2)\n", + "Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.7/dist-packages (from torchvision==0.9.2+cu111) (7.1.2)\n", + "Installing collected packages: torch, torchvision, torchaudio\n", + " Attempting uninstall: torch\n", + " Found existing installation: torch 1.10.0+cu111\n", + " Uninstalling torch-1.10.0+cu111:\n", + " Successfully uninstalled torch-1.10.0+cu111\n", + " Attempting uninstall: torchvision\n", + " Found existing installation: torchvision 0.11.1+cu111\n", + " Uninstalling torchvision-0.11.1+cu111:\n", + " Successfully uninstalled torchvision-0.11.1+cu111\n", + " Attempting uninstall: torchaudio\n", + " Found existing installation: torchaudio 0.10.0+cu111\n", + " Uninstalling torchaudio-0.10.0+cu111:\n", + " Successfully uninstalled torchaudio-0.10.0+cu111\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages 
that are installed. This behaviour is the source of the following dependency conflicts.\n", + "torchtext 0.11.0 requires torch==1.10.0, but you have torch 1.8.2+cu111 which is incompatible.\u001b[0m\n", + "Successfully installed torch-1.8.2+cu111 torchaudio-0.8.2 torchvision-0.9.2+cu111\n", + "Collecting onnxruntime==1.8.1\n", + " Downloading onnxruntime-1.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)\n", + "\u001b[K |████████████████████████████████| 4.5 MB 7.5 MB/s \n", + "\u001b[?25hCollecting onnx\n", + " Downloading onnx-1.11.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (12.8 MB)\n", + "\u001b[K |████████████████████████████████| 12.8 MB 38.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: opencv-python in /usr/local/lib/python3.7/dist-packages (4.1.2.30)\n", + "Requirement already satisfied: protobuf in /usr/local/lib/python3.7/dist-packages (from onnxruntime==1.8.1) (3.17.3)\n", + "Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.7/dist-packages (from onnxruntime==1.8.1) (1.21.5)\n", + "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.7/dist-packages (from onnxruntime==1.8.1) (2.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.2.1 in /usr/local/lib/python3.7/dist-packages (from onnx) (3.10.0.2)\n", + "Requirement already satisfied: six>=1.9 in /usr/local/lib/python3.7/dist-packages (from protobuf->onnxruntime==1.8.1) (1.15.0)\n", + "Installing collected packages: onnxruntime, onnx\n", + "Successfully installed onnx-1.11.0 onnxruntime-1.8.1\n" + ] + } + ], + "source": [ + "# 安装 cuda 11.1 的 PyTorch \n", + "# 如果你用的是其他版本的 cuda,请参考 PyTorch 的官方安装教程选择安装命令 \n", + "!pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html\n", + "# 安装 ONNX Runtime, ONNX, OpenCV \n", + "!pip install onnxruntime==1.8.1 onnx opencv-python" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Z-_3PEwW37Em" + }, + 
"source": [ + "在一切都配置完毕后,用下面的代码来创建一个经典的超分辨率模型 SRCNN。" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 521 + }, + "id": "4Bk1bkp03-DA", + "outputId": "d0a27fe1-1ac4-45b3-e37f-932a27d97d5d" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + " \n", + "import cv2 \n", + "from matplotlib import pyplot as plt\n", + "import numpy as np \n", + "import requests \n", + "import torch \n", + "import torch.onnx \n", + "from torch import nn \n", + "\n", + "class SuperResolutionNet(nn.Module):\n", + " def __init__(self, upscale_factor):\n", + " super().__init__()\n", + " self.upscale_factor = upscale_factor\n", + " self.img_upsampler = nn.Upsample(\n", + " scale_factor=self.upscale_factor,\n", + " mode='bicubic',\n", + " align_corners=False)\n", + " \n", + " self.conv1 = nn.Conv2d(3,64,kernel_size=9,padding=4)\n", + " self.conv2 = nn.Conv2d(64,32,kernel_size=1,padding=0)\n", + " self.conv3 = nn.Conv2d(32,3,kernel_size=5,padding=2)\n", + "\n", + " self.relu = nn.ReLU()\n", + " \n", + " def forward(self, x):\n", + " x = self.img_upsampler(x)\n", + " out = self.relu(self.conv1(x))\n", + " out = self.relu(self.conv2(out))\n", + " out = self.conv3(out)\n", + " return out\n", + " \n", + "# Download checkpoint and test image \n", + "urls = ['https://download.openmmlab.com/mmediting/restorers/srcnn/srcnn_x4k915_1x16_1000k_div2k_20200608-4186f232.pth', \n", + " 'https://raw.githubusercontent.com/open-mmlab/mmediting/master/tests/data/face/000001.png']\n", + "names = ['srcnn.pth', 'face.png']\n", + "for url, name in zip(urls, names):\n", + " if not os.path.exists(name):\n", + " open(name, 'wb').write(requests.get(url).content)\n", + " \n", + "def init_torch_model():\n", + " torch_model = SuperResolutionNet(upscale_factor=3)\n", + " \n", + " state_dict = torch.load('srcnn.pth')['state_dict']\n", + " \n", + " # Adapt the checkpoint\n", + " for old_key in list(state_dict.keys()):\n", + " new_key = '.'.join(old_key.split('.')[1:])\n", + " state_dict[new_key] = state_dict.pop(old_key)\n", + " \n", + " torch_model.load_state_dict(state_dict)\n", + " torch_model.eval()\n", + " return 
torch_model\n", + " \n", + "model = init_torch_model()\n", + "input_img = cv2.imread('face.png')\n", + "plt.imshow(cv2.cvtColor(input_img,cv2.COLOR_BGR2RGB))\n", + "plt.show()\n", + "input_img = input_img.astype(np.float32)\n", + "# HWC to NCHW \n", + "input_img = np.transpose(input_img, [2, 0, 1])\n", + "input_img = np.expand_dims(input_img, 0)\n", + " \n", + "# Inference \n", + "torch_output = model(torch.from_numpy(input_img)).detach().numpy()\n", + " \n", + "# NCHW to HWC \n", + "torch_output = np.squeeze(torch_output, 0)\n", + "torch_output = np.clip(torch_output, 0, 255)\n", + "torch_output = np.transpose(torch_output, [1, 2, 0]).astype(np.uint8)\n", + " \n", + "# Show image \n", + "cv2.imwrite(\"face_torch.png\", torch_output)\n", + "\n", + "plt.imshow(cv2.cvtColor(torch_output,cv2.COLOR_BGR2RGB))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lp02VxnL4JII" + }, + "source": [ + "SRCNN 先把图像上采样到对应分辨率,再用 3 个卷积层处理图像。为了方便起见,我们跳过训练网络的步骤,直接下载模型权重(由于 MMEditing 中 SRCNN 的权重结构和我们定义的模型不太一样,我们修改了权重字典的 key 来适配我们定义的模型),同时下载好输入图片。为了让模型输出成正确的图片格式,我们把模型的输出转换成 HWC 格式,并保证每一通道的颜色值都在 0~255 之间。如果脚本正常运行的话,一幅超分辨率的人脸照片会保存在 “face_torch.png” 中。\n", + "\n", + "![face_torch](https://user-images.githubusercontent.com/4560679/156558692-e5b82284-22d1-434b-aace-b565ac223e73.png)\n", + "\n", + "在 PyTorch 模型测试正确后,我们来正式开始部署这个模型。我们下一步的任务是把 PyTorch 模型转换成用中间表示 ONNX 描述的模型。\n", + "## 中间表示 - ONNX\n", + "在介绍 ONNX 之前,我们先从本质上来认识一下神经网络的结构。神经网络实际上只是描述了数据计算的过程,其结构可以用计算图表示。比如 a+b 可以用下面的计算图来表示:\n", + "\n", + "![a+b](https://user-images.githubusercontent.com/4560679/156558717-96bbe544-4dc7-4460-8850-3cb1790e39ec.png)\n", + "\n", + "为了加速计算,一些框架会使用对神经网络“先编译,后执行”的静态图来描述网络。静态图的缺点是难以描述控制流(比如 if-else 分支语句和 for 循环语句),直接对其引入控制语句会导致产生不同的计算图。比如循环执行 n 次 a=a+b,对于不同的 n,会生成不同的计算图:\n", + "\n", + "![n=2](https://user-images.githubusercontent.com/4560679/156558606-6ff18e19-f3b1-463f-8f83-60bf6f7ef64b.png)\n", + "\n", + "ONNX (Open Neural Network Exchange)是 Facebook 
和微软在2017年共同发布的,用于标准描述计算图的一种格式。目前,在数家机构的共同维护下,ONNX 已经对接了多种深度学习框架和多种推理引擎。因此,ONNX 被当成了深度学习框架到推理引擎的桥梁,就像编译器的中间语言一样。由于各框架兼容性不一,我们通常只用 ONNX 表示更容易部署的静态图。\n", + "\n", + "让我们用下面的代码来把 PyTorch 的模型转换成 ONNX 格式的模型:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6a8bZDui474h" + }, + "outputs": [], + "source": [ + "x = torch.randn(1, 3, 256, 256)\n", + "\n", + "with torch.no_grad():\n", + " torch.onnx.export(\n", + " model,\n", + " x,\n", + " \"srcnn.onnx\",\n", + " opset_version=11,\n", + " input_names=['input'],\n", + " output_names=['output'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tEUmJIF053lI" + }, + "source": [ + "其中,**torch.onnx.export** 是 PyTorch 自带的把模型转换成 ONNX 格式的函数。让我们先看一下前三个必选参数:前三个参数分别是要转换的模型、模型的任意一组输入、导出的 ONNX 文件的文件名。转换模型时,需要原模型和输出文件名是很容易理解的,但为什么需要为模型提供一组输入呢?这就涉及到 ONNX 转换的原理了。从 PyTorch 的模型到 ONNX 的模型,本质上是一种语言上的翻译。直觉上的想法是像编译器一样彻底解析原模型的代码,记录所有控制流。但前面也讲到,我们通常只用 ONNX 记录不考虑控制流的静态图。因此,PyTorch 提供了一种叫做追踪(trace)的模型转换方法:给定一组输入,再实际执行一遍模型,即把这组输入对应的计算图记录下来,保存为 ONNX 格式。export 函数用的就是追踪导出方法,需要给任意一组输入,让模型跑起来。我们的测试图片是三通道,256x256大小的,这里也构造一个同样形状的随机张量。\n", + "\n", + "剩下的参数中,opset_version 表示 ONNX 算子集的版本。深度学习的发展会不断诞生新算子,为了支持这些新增的算子,ONNX会经常发布新的算子集,目前已经更新15个版本。我们令 opset_version = 11,即使用第11个 ONNX 算子集,是因为 SRCNN 中的 bicubic (双三次插值)在 opset11 中才得到支持。剩下的两个参数 input_names, output_names 是输入、输出 tensor 的名称,我们稍后会用到这些名称。\n", + "\n", + "如果上述代码运行成功,目录下会新增一个\"srcnn.onnx\"的 ONNX 模型文件。我们可以用下面的脚本来验证一下模型文件是否正确。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QCcECxU959TW", + "outputId": "2e46f06b-49ba-47af-d8b0-e2335d91684c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model correct\n" + ] + } + ], + "source": [ + "import onnx\n", + " \n", + "onnx_model = onnx.load(\"srcnn.onnx\")\n", + "try:\n", + " onnx.checker.check_model(onnx_model)\n", + "except Exception:\n", + " print(\"Model incorrect\")\n", + 
"else:\n", + " print(\"Model correct\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f9hh54Rj5_Vj" + }, + "source": [ + "其中,**onnx.load** 函数用于读取一个 ONNX 模型。**onnx.checker.check_model** 用于检查模型格式是否正确,如果有错误的话该函数会直接报错。我们的模型是正确的,控制台中应该会打印出\"Model correct\"。\n", + "\n", + "接下来,让我们来看一看 ONNX 模型具体的结构是怎么样的。我们可以使用 **Netron** (开源的模型可视化工具)来可视化 ONNX 模型。把 srcnn.onnx 文件从本地的文件系统拖入网站,即可看到如下的可视化结果:\n", + "\n", + "![model](https://user-images.githubusercontent.com/4560679/156558675-df96e7f8-0c90-4b52-81db-f80e21e522a1.png)\n", + "\n", + "点击 input 或者 output,可以查看 ONNX 模型的基本信息,包括模型的版本信息,以及模型输入、输出的名称和数据类型。\n", + "\n", + "![model_property](https://user-images.githubusercontent.com/4560679/156558624-0d77bf2c-bd01-40e3-a89c-1b0f69329576.png)\n", + "\n", + "点击某一个算子节点,可以看到算子的具体信息。比如点击第一个 Conv 可以看到:\n", + "\n", + "![node_property](https://user-images.githubusercontent.com/4560679/156558668-867ea202-9ac2-4a04-b836-91ced4f2e5ea.png)\n", + "\n", + "每个算子记录了算子属性、图结构、权重三类信息。\n", + "\n", + "* 算子属性信息即图中 attributes 里的信息,对于卷积来说,算子属性包括了卷积核大小(kernel_shape)、卷积步长(strides)等内容。这些算子属性最终会用来生成一个具体的算子。\n", + "* 图结构信息指算子节点在计算图中的名称、邻边的信息。对于图中的卷积来说,该算子节点叫做 Conv_2,输入数据叫做 11,输出数据叫做 12。根据每个算子节点的图结构信息,就能完整地复原出网络的计算图。\n", + "* 权重信息指的是网络经过训练后,算子存储的权重信息。对于卷积来说,权重信息包括卷积核的权重值和卷积后的偏差值。点击图中 conv1.weight, conv1.bias 后面的加号即可看到权重信息的具体内容。\n", + "现在,我们有了 SRCNN 的 ONNX 模型。让我们看看最后该如何把这个模型运行起来。" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P80yIyaD-SVI" + }, + "source": [ + "## 推理引擎 -ONNX Runtime\n", + "**ONNX Runtime** 是由微软维护的一个跨平台机器学习推理加速器,也就是我们前面提到的”推理引擎“。ONNX Runtime 是直接对接 ONNX 的,即 ONNX Runtime 可以直接读取并运行 .onnx 文件, 而不需要再把 .onnx 格式的文件转换成其他格式的文件。也就是说,对于 PyTorch - ONNX - ONNX Runtime 这条部署流水线,只要在目标设备中得到 .onnx 文件,并在 ONNX Runtime 上运行模型,模型部署就算大功告成了。\n", + "\n", + "通过刚刚的操作,我们把 PyTorch 编写的模型转换成了 ONNX 模型,并通过可视化检查了模型的正确性。最后,让我们用 ONNX Runtime 运行一下模型,完成模型部署的最后一步。\n", + "\n", + "ONNX Runtime 提供了 Python 接口。接着刚才的脚本,我们可以添加如下代码运行模型:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { 
+ "id": "EnXyju9--UNo" + }, + "outputs": [], + "source": [ + "import onnxruntime\n", + "\n", + "ort_session = onnxruntime.InferenceSession(\"srcnn.onnx\")\n", + "ort_inputs = {'input': input_img}\n", + "ort_output = ort_session.run(['output'], ort_inputs)[0]\n", + "\n", + "ort_output = np.squeeze(ort_output, 0)\n", + "ort_output = np.clip(ort_output, 0, 255)\n", + "ort_output = np.transpose(ort_output, [1, 2, 0]).astype(np.uint8)\n", + "cv2.imwrite(\"face_ort.png\", ort_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kE6nCBPC-cbK" + }, + "source": [ + "这段代码中,除去后处理操作外,和 ONNX Runtime 相关的代码只有三行。让我们简单解析一下这三行代码。**onnxruntime.InferenceSession** 用于获取一个 ONNX Runtime 推理器,其参数是用于推理的 ONNX 模型文件。推理器的 run 方法用于模型推理,其第一个参数为输出张量名的列表,第二个参数为输入值的字典。其中输入值字典的 key 为张量名,value 为 numpy 类型的张量值。输入输出张量的名称需要和 **torch.onnx.export** 中设置的输入输出名对应。\n", + "\n", + "如果代码正常运行的话,另一幅超分辨率照片会保存在\"face_ort.png\"中。这幅图片和刚刚得到的\"face_torch.png\"是一模一样的。这说明 ONNX Runtime 成功运行了 SRCNN 模型,模型部署完成了!以后有用户想实现超分辨率的操作,我们只需要提供一个 \"srcnn.onnx\" 文件,并帮助用户配置好 ONNX Runtime 的 Python 环境,用几行代码就可以运行模型了。或者还有更简便的方法,我们可以利用 ONNX Runtime 编译出一个可以直接执行模型的应用程序。我们只需要给用户提供 ONNX 模型文件,并让用户在应用程序选择要执行的 ONNX 模型文件名就可以运行模型了。\n", + "\n", + "# 总结\n", + "在这篇教程里,我们利用成熟的模型部署工具,轻松部署了一个初始版本的超分辨率模型 SRCNN。但在实际应用场景中,随着模型结构的复杂度不断加深,碰到的困难的也会越来越多。在下一篇教程里,我们将“升级”一下这个超分辨率模型,让它支持动态的输入。\n", + "\n", + "看完这篇教程,是不是感觉知识太多一下消化不过来?没关系,模型部署本身有非常多的东西要学。为了举例的方便,这篇教程包含了许多未来才会讲到的知识点。事实上,读完这篇教程后,记下以下知识点就够了:\n", + "\n", + "* 模型部署,指把训练好的模型在特定环境中运行的过程。模型部署要解决模型框架兼容性差和模型运行速度慢这两大问题。\n", + "* 模型部署的常见流水线是“深度学习框架-中间表示-推理引擎”。其中比较常用的一个中间表示是 ONNX。\n", + "* 深度学习模型实际上就是一个计算图。模型部署时通常把模型转换成静态的计算图,即没有控制流(分支语句、循环语句)的计算图。\n", + "* PyTorch 框架自带对 ONNX 的支持,只需要构造一组随机的输入,并对模型调用 **torch.onnx.export** 即可完成 PyTorch 到 ONNX 的转换。\n", + "* 推理引擎 ONNX Runtime 对 ONNX 模型有原生的支持。给定一个 .onnx 文件,只需要简单使用 ONNX Runtime 的 Python API 就可以完成模型推理。\n", + "\n", + "为了实现深度学习算法的落地,充满挑战的模型部署是一个逃不开的步骤。为此,我们开发的开源库 MMDeploy 实现了 OpenMMLab 中目标检测、图像分割、超分辨率等多个视觉任务模型的部署,支持 ONNX Runtime,TensorRT,ncnn 
,openppl,OpenVINO 等多个推理引擎。在后续的模型部署教程中,我们将在介绍模型部署技术的同时,介绍这些技术是如何运用在 MMDeploy 中的。希望大家继续关注我们的后续教程,关注 MMDeploy,共同为深度学习算法落地贡献自己的一份力。" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "tutorials.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 8aba06d2f3f45279cd195e1d720cb15395b45a52 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Wed, 27 Apr 2022 22:31:34 +0800 Subject: [PATCH 27/51] fix mmpose api (#396) * fix mmpose api * use fmt::format instead * fix potential nullptr access --- csrc/apis/c/pose_detector.cpp | 18 ++++- csrc/apis/python/pose_detector.cpp | 67 ++++++++++++++----- .../mmpose/deploy/pose_detection_model.py | 2 +- 3 files changed, 69 insertions(+), 18 deletions(-) diff --git a/csrc/apis/c/pose_detector.cpp b/csrc/apis/c/pose_detector.cpp index acc148ee1b..08ed15be73 100644 --- a/csrc/apis/c/pose_detector.cpp +++ b/csrc/apis/c/pose_detector.cpp @@ -109,6 +109,9 @@ int mmdeploy_pose_detector_apply_bbox(mm_handle_t handle, const mm_mat_t* mats, Value img_with_boxes; if (bboxes && bbox_count) { + if (bbox_count[i] == 0) { + continue; + } for (int j = 0; j < bbox_count[i]; ++j) { Value obj; obj["ori_img"] = _mat; @@ -132,8 +135,12 @@ int mmdeploy_pose_detector_apply_bbox(mm_handle_t handle, const mm_mat_t* mats, input.front().push_back(img_with_boxes); } - auto output = pose_detector->Run(std::move(input)).value().front(); + // no box + if (result_count == 0) { + return MM_SUCCESS; + } + auto output = pose_detector->Run(std::move(input)).value().front(); auto pose_outputs = from_value>>(output); std::vector counts; @@ -152,8 +159,12 @@ int mmdeploy_pose_detector_apply_bbox(mm_handle_t handle, const mm_mat_t* mats, std::unique_ptr _results( new mm_pose_detect_t[result_count]{}, deleter); + int uid = 0; for (int i = 0; i < mat_count; ++i) { - auto& pose_output = pose_outputs[i]; + if 
(counts[i] == 0) { + continue; + } + auto& pose_output = pose_outputs[uid++]; for (int j = 0; j < pose_output.size(); ++j) { auto& res = _results[offsets[i] + j]; auto& box_result = pose_output[j]; @@ -181,6 +192,9 @@ int mmdeploy_pose_detector_apply_bbox(mm_handle_t handle, const mm_mat_t* mats, } void mmdeploy_pose_detector_release_result(mm_pose_detect_t* results, int count) { + if (results == nullptr) { + return; + } for (int i = 0; i < count; ++i) { delete[] results[i].point; delete[] results[i].score; diff --git a/csrc/apis/python/pose_detector.cpp b/csrc/apis/python/pose_detector.cpp index da1af9af2d..9f3c92dab3 100644 --- a/csrc/apis/python/pose_detector.cpp +++ b/csrc/apis/python/pose_detector.cpp @@ -2,11 +2,15 @@ #include "pose_detector.h" +#include + #include "common.h" #include "core/logger.h" namespace mmdeploy { +using Rect = std::array; + class PyPoseDedector { public: PyPoseDedector(const char *model_path, const char *device_name, int device_id) { @@ -17,41 +21,73 @@ class PyPoseDedector { throw std::runtime_error("failed to create pose_detector"); } } - py::list Apply(const std::vector &imgs, const std::vector> &_boxes) { + py::list Apply(const std::vector &imgs, const std::vector> &vboxes) { + if (imgs.size() == 0 && vboxes.size() == 0) { + return py::list{}; + } + if (vboxes.size() != 0 && vboxes.size() != imgs.size()) { + std::string error = + fmt::format("imgs length not equal with vboxes [{} vs {}]", imgs.size(), vboxes.size()); + throw std::invalid_argument(error); + } + std::vector mats; std::vector boxes; + std::vector bbox_count; mats.reserve(imgs.size()); for (const auto &img : imgs) { auto mat = GetMat(img); mats.push_back(mat); } - for (const auto &_box : _boxes) { - mm_rect_t box = {_box[0], _box[1], _box[2], _box[3]}; - boxes.push_back(box); + + for (auto _boxes : vboxes) { + for (auto _box : _boxes) { + mm_rect_t box = {_box[0], _box[1], _box[2], _box[3]}; + boxes.push_back(box); + } + bbox_count.push_back(_boxes.size()); + } + + // 
full image + if (vboxes.size() == 0) { + for (int i = 0; i < mats.size(); i++) { + mm_rect_t box = {0.f, 0.f, mats[i].width - 1, mats[i].height - 1}; + boxes.push_back(box); + bbox_count.push_back(1); + } } + mm_pose_detect_t *detection{}; - int num_box = boxes.size(); auto status = mmdeploy_pose_detector_apply_bbox(handle_, mats.data(), (int)mats.size(), - boxes.data(), &num_box, &detection); + boxes.data(), bbox_count.data(), &detection); if (status != MM_SUCCESS) { throw std::runtime_error("failed to apply pose_detector, code: " + std::to_string(status)); } + auto output = py::list{}; auto result = detection; for (int i = 0; i < mats.size(); i++) { + if (bbox_count[i] == 0) { + output.append(py::none()); + continue; + } int n_point = result->length; - auto pred = py::array_t({1, n_point, 3}); + auto pred = py::array_t({bbox_count[i], n_point, 3}); auto dst = pred.mutable_data(); - for (int j = 0; j < n_point; j++) { - dst[0] = result->point[j].x; - dst[1] = result->point[j].y; - dst[2] = result->score[j]; - dst += 3; + for (int j = 0; j < bbox_count[i]; j++) { + for (int k = 0; k < n_point; k++) { + dst[0] = result->point[k].x; + dst[1] = result->point[k].y; + dst[2] = result->score[k]; + dst += 3; + } + result++; } output.append(std::move(pred)); - result++; } - mmdeploy_pose_detector_release_result(detection, (int)mats.size()); + + int total = std::accumulate(bbox_count.begin(), bbox_count.end(), 0); + mmdeploy_pose_detector_release_result(detection, total); return output; } ~PyPoseDedector() { @@ -68,7 +104,8 @@ static void register_python_pose_detector(py::module &m) { .def(py::init([](const char *model_path, const char *device_name, int device_id) { return std::make_unique(model_path, device_name, device_id); })) - .def("__call__", &PyPoseDedector::Apply); + .def("__call__", &PyPoseDedector::Apply, py::arg("imgs"), + py::arg("vboxes") = std::vector>()); } class PythonPoseDetectorRegisterer { diff --git 
a/mmdeploy/codebase/mmpose/deploy/pose_detection_model.py b/mmdeploy/codebase/mmpose/deploy/pose_detection_model.py index 1844c5cc10..476284ce33 100644 --- a/mmdeploy/codebase/mmpose/deploy/pose_detection_model.py +++ b/mmdeploy/codebase/mmpose/deploy/pose_detection_model.py @@ -214,7 +214,7 @@ def forward(self, img: List[torch.Tensor], *args, **kwargs) -> list: bbox_ids.append(img_meta['bbox_id']) pred = self.wrapper.handle( - [img[0].contiguous().detach().cpu().numpy()], sdk_boxes)[0] + [img[0].contiguous().detach().cpu().numpy()], [sdk_boxes])[0] result = dict( preds=pred, From 16ee9c7843db950df37141ac24a7bdaaaa8225d4 Mon Sep 17 00:00:00 2001 From: lzhangzz Date: Tue, 3 May 2022 14:49:19 +0800 Subject: [PATCH 28/51] [Fix] support latest spdlog (#423) * support formatting `PixelFormat` & `DataType` * format enum for legacy spdlog * fix format --- csrc/core/utils/formatter.h | 58 ++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/csrc/core/utils/formatter.h b/csrc/core/utils/formatter.h index b1c2280909..5cef0f83c2 100644 --- a/csrc/core/utils/formatter.h +++ b/csrc/core/utils/formatter.h @@ -3,7 +3,11 @@ #ifndef MMDEPLOY_SRC_UTILS_FORMATTER_H_ #define MMDEPLOY_SRC_UTILS_FORMATTER_H_ +#include + #include "core/logger.h" +#include "core/types.h" +#include "spdlog/fmt/ostr.h" #if FMT_VERSION >= 50000 #include "spdlog/fmt/bundled/ranges.h" @@ -17,12 +21,55 @@ class Value; MMDEPLOY_API std::string format_value(const Value& value); +inline std::string to_string(PixelFormat format) { + switch (format) { + case PixelFormat::kBGR: + return "BGR"; + case PixelFormat::kRGB: + return "RGB"; + case PixelFormat::kGRAYSCALE: + return "GRAYSCALE"; + case PixelFormat::kNV12: + return "NV12"; + case PixelFormat::kNV21: + return "NV21"; + case PixelFormat::kBGRA: + return "BGRA"; + default: + return "invalid_format_enum"; + } +} + +inline std::string to_string(DataType type) { + switch (type) { + case DataType::kFLOAT: + return 
"FLOAT"; + case DataType::kHALF: + return "HALF"; + case DataType::kINT8: + return "INT8"; + case DataType::kINT32: + return "INT32"; + case DataType::kINT64: + return "INT64"; + default: + return "invalid_data_type_enum"; + } +} + +inline std::ostream& operator<<(std::ostream& os, PixelFormat format) { + return os << to_string(format); +} + +inline std::ostream& operator<<(std::ostream& os, DataType type) { return os << to_string(type); } + } // namespace mmdeploy namespace fmt { #if FMT_VERSION >= 50000 +// `Value` maybe an incomplete type at this point, making `operator<<` not usable template <> struct formatter { constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } @@ -34,21 +81,16 @@ struct formatter { #else -inline void format_arg(BasicFormatter &f, const char *, const mmdeploy::Value &d) { +inline void format_arg(BasicFormatter& f, const char*, const mmdeploy::Value& d) { f.writer() << mmdeploy::format_value(d); } -template >::value, bool> = true> -void format_arg(BasicFormatter &f, const char *, const T &v) { - f.writer() << (int)v; -} - template -auto format_arg(BasicFormatter &f, const char *, const T &v) +auto format_arg(BasicFormatter& f, const char*, const T& v) -> std::void_t { f.writer() << "["; bool first = true; - for (const auto &x : v) { + for (const auto& x : v) { f.writer() << (first ? 
"" : ", ") << fmt::format("{}", x); first = false; } From 86ab0637bc771a08f47f409b683a327c1b0d5da7 Mon Sep 17 00:00:00 2001 From: VVsssssk <88368822+VVsssssk@users.noreply.github.com> Date: Thu, 5 May 2022 11:11:32 +0800 Subject: [PATCH 29/51] fix pillarencode (#331) --- .../mmdet3d/deploy/voxel_detection_model.py | 4 ++-- .../codebase/mmdet3d/models/pillar_encode.py | 19 +++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/mmdeploy/codebase/mmdet3d/deploy/voxel_detection_model.py b/mmdeploy/codebase/mmdet3d/deploy/voxel_detection_model.py index c5696ef50a..d351b6475f 100644 --- a/mmdeploy/codebase/mmdet3d/deploy/voxel_detection_model.py +++ b/mmdeploy/codebase/mmdet3d/deploy/voxel_detection_model.py @@ -174,7 +174,7 @@ def voxelize(model_cfg: Union[str, mmcv.Config], points: torch.Tensor): @staticmethod def post_process(model_cfg: Union[str, mmcv.Config], deploy_cfg: Union[str, mmcv.Config], - outs: torch.Tensor, + outs: Dict, img_metas: Dict, device: str, rescale=False): @@ -184,7 +184,7 @@ def post_process(model_cfg: Union[str, mmcv.Config], model_cfg (str | mmcv.Config): The model config. deploy_cfg (str|mmcv.Config): Deployment config file or loaded Config object. - outs (torch.Tensor): Output of model's head. + outs (Dict): Output of model's head. img_metas(Dict): Meta info for pcd. device (str): A string specifying device type. rescale (list[torch.Tensor]): whether th rescale bbox. 
diff --git a/mmdeploy/codebase/mmdet3d/models/pillar_encode.py b/mmdeploy/codebase/mmdet3d/models/pillar_encode.py index 71a30647b7..23d6c8d15a 100644 --- a/mmdeploy/codebase/mmdet3d/models/pillar_encode.py +++ b/mmdeploy/codebase/mmdet3d/models/pillar_encode.py @@ -30,19 +30,18 @@ def pillar_encoder__forward(ctx, self, features, num_points, coors): # Find distance of x, y, and z from pillar center device = features.device + if self._with_voxel_center: if not self.legacy: - f_center = features[..., :3] - ( - coors * torch.tensor([1, self.vz, self.vy, self.vx]).to(device) - + - torch.tensor([1, self.z_offset, self.y_offset, self.x_offset - ]).to(device)).unsqueeze(1).flip(2)[..., :3] + f_center = features[..., :3] - (coors[..., 1:] * torch.tensor( + [self.vz, self.vy, self.vx]).to(device) + torch.tensor([ + self.z_offset, self.y_offset, self.x_offset + ]).to(device)).unsqueeze(1).flip(2) else: - f_center = features[..., :3] - ( - coors * torch.tensor([1, self.vz, self.vy, self.vx]).to(device) - + - torch.tensor([1, self.z_offset, self.y_offset, self.x_offset - ]).to(device)).unsqueeze(1).flip(2)[..., :3] + f_center = features[..., :3] - (coors[..., 1:] * torch.tensor( + [self.vz, self.vy, self.vx]).to(device) + torch.tensor([ + self.z_offset, self.y_offset, self.x_offset + ]).to(device)).unsqueeze(1).flip(2) features_ls[0] = torch.cat((f_center, features[..., 3:]), dim=-1) features_ls.append(f_center) From 5231e65f9498f46592258e80a2b1316cc93ee97e Mon Sep 17 00:00:00 2001 From: NagatoYuki0943 <72508155+NagatoYuki0943@users.noreply.github.com> Date: Sat, 7 May 2022 17:02:06 +0800 Subject: [PATCH 30/51] fix ONNXRuntime cuda test bug (#438) --- mmdeploy/codebase/mmdet/deploy/object_detection_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mmdeploy/codebase/mmdet/deploy/object_detection_model.py b/mmdeploy/codebase/mmdet/deploy/object_detection_model.py index 51f2b3cc80..34b1e8bcc6 100644 --- a/mmdeploy/codebase/mmdet/deploy/object_detection_model.py +++ 
b/mmdeploy/codebase/mmdet/deploy/object_detection_model.py @@ -208,6 +208,7 @@ def forward(self, img: Sequence[torch.Tensor], img_metas: Sequence[dict], rescale = kwargs.get('rescale', True) for i in range(batch_size): dets, labels = batch_dets[i], batch_labels[i] + dets = dets.to(device=torch.device(self.device)) if rescale: scale_factor = img_metas[i]['scale_factor'] From c2f2edcf313778b7fba8c7c6e30f1e8c98aaa89f Mon Sep 17 00:00:00 2001 From: "q.yao" Date: Sat, 7 May 2022 19:31:42 +0800 Subject: [PATCH 31/51] Fix ci in master branch (#441) --- .github/workflows/build.yml | 12 +++++++----- requirements/optional.txt | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0f54134add..0f33b3b0cd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -3,14 +3,14 @@ name: build on: push: paths-ignore: - - 'demo/**' - - 'tools/**' + - "demo/**" + - "tools/**" pull_request: paths-ignore: - - 'demo/**' - - 'tools/**' - - 'docs/**' + - "demo/**" + - "tools/**" + - "docs/**" concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -78,6 +78,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install system dependencies run: | + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev python${{matrix.python-version}}-dev apt-get clean rm -rf /var/lib/apt/lists/* @@ -122,6 +123,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install system dependencies run: | + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev python${{matrix.python-version}}-dev apt-get clean rm -rf /var/lib/apt/lists/* diff --git a/requirements/optional.txt b/requirements/optional.txt index 8e52e01a66..327b4222cf 100644 --- 
a/requirements/optional.txt +++ b/requirements/optional.txt @@ -2,7 +2,7 @@ mmcls>=0.15.0,<=0.19.0 mmdet>=2.19.0,<=2.20.0 mmedit mmocr>=0.3.0,<=0.4.1 -mmpose>=0.24.0 +mmpose>=0.24.0,<=0.25.1 mmsegmentation onnxruntime>=1.8.0 openvino-dev From f45c1f09b15aaca227a91ecaed8123d308044aae Mon Sep 17 00:00:00 2001 From: HinGwenWoong Date: Sat, 7 May 2022 19:38:25 +0800 Subject: [PATCH 32/51] [Doc] Improve Jetson tutorial install doc (#381) * Improve Jetson build doc * add torchvision in the doc * Fix lint * Fix lint * Fix lint * Fix arg bug * remove incorrect process * Improve doc * Add more detail on `Conda` * Add python version detail * Install `onnx` instead of `onnxruntime` * Fix gramma * Fix gramma * Update Installation detail and fix some doc detail * Update how_to_install_mmdeploy_on_jetsons.md * Fix tensorrt and cudnn path * Improve FAQ * Improve FAQs * pplcv not switch branch since the `sm_53` missing * Update how_to_install_mmdeploy_on_jetsons.md * Update how_to_install_mmdeploy_on_jetsons.md * Update how_to_install_mmdeploy_on_jetsons.md * Update how_to_install_mmdeploy_on_jetsons.md * Improve doc * Update how_to_install_mmdeploy_on_jetsons.md * export `TENSORRT_DIR` * Using pre-build cmake to update * Improve sentence and add jetpack version * Improve sentence * move TENSORRT_DIR in the `Make TensorRT env` step * Improve CUDA detail * Update how_to_install_mmdeploy_on_jetsons.md * Update how_to_install_mmdeploy_on_jetsons.md * Improve conda installation * Improve TensorRT installation * Fix lint * Add pip crash detail and FAQ * Improve pip crash * refine the jetson installation guide * Improve python version * Improve doc, added some detail * Fix lint * Add detail for `Runtime` problem * Fix word * Update how_to_install_mmdeploy_on_jetsons.md Co-authored-by: lvhan028 --- .../how_to_install_mmdeploy_on_jetsons.md | 317 +++++++++++++----- 1 file changed, 230 insertions(+), 87 deletions(-) diff --git a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md 
b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md index 1b8deb1f2f..ca5f7e3a8d 100644 --- a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md +++ b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md @@ -1,127 +1,270 @@ -## How to install mmdeploy on Jetsons +# Build for Jetson -This tutorial introduces how to install mmdeploy on Nvidia Jetson systems. It mainly introduces the installation of mmdeploy on three Jetson series boards: +In this chapter, we introduce how to install mmdeploy on NVIDIA Jetson platforms, which we have verifed on the following models: - Jetson Nano -- Jetson AGX Xavier - Jetson TX2 +- Jetson AGX Xavier -For Jetson Nano, we use Jetson Nano 2GB and install [JetPack SDK](https://developer.nvidia.com/embedded/jetpack) through SD card image method. - -### Install JetPack SDK - -There are mainly two ways to install the JetPack: -1. Write the image to the SD card directly. -2. Use the SDK Manager to do this. - -The first method does not need two separated machines and their display equipment or cables. We just follow the instruction to write the image. This is pretty convenient. Click [here](https://developer.nvidia.com/embedded/learn/get-started-jetson-nano-2gb-devkit#intro) for Jetson Nano 2GB to start. And click [here](https://developer.nvidia.com/embedded/learn/get-started-jetson-nano-devkit) for Jetson Nano 4GB to start the journey. +## Prerequisites -The second method, however, requires we set up another display tool and cable to the jetson hardware. This method is safer than the previous one as the first method may sometimes cannot write the image in and throws a warning during validation. Click [here](https://docs.nvidia.com/sdk-manager/install-with-sdkm-jetson/index.html) to start. +To equip a Jetson device, JetPack SDK is a must. +Besides, the Model Converter of MMDeploy requires an environment with PyTorch for converting PyTorch models to ONNX models. 
+Regarding the toolchain, cmake and gcc has to be upgraded no less than 3.14 and 7.0 respectively. -For the first method, if it always throws `Attention something went wrong...` even the file already get re-downloaded, just try `wget` to download the file and change the tail name instead. +### JetPack SDK -### Launch the system +JetPack SDK provides a full development environment for hardware-accelerated AI-at-the-edge development. +All Jetson modules and developer kits are supported by JetPack SDK. -Sometimes we just need to reboot the jetson device when it gets stuck in initializing the system. +There are two major installation methods including, +1. SD Card Image Method +2. NVIDIA SDK Manager Method -### Cuda +You can find a very detailed installation guide from NVIDIA [official website](https://developer.nvidia.com/jetpack-sdk-50dp). -The Cuda is installed by default while the cudnn is not if we use the first method. We have to write the cuda path and lib to `$PATH` and `$LD_LIBRARY_PATH`: -``` -export PATH=$PATH:/usr/local/cuda/bin -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 -``` -Then we can use `nvcc -V` the get the version of cuda we use. +Here we choose [JetPack 4.6.1](https://developer.nvidia.com/jetpack-sdk-461) as our best practice on setup Jetson platforms. -### Anaconda +### Conda -We have to install [Archiconda](https://github.com/Archiconda/build-tools/releases) instead as the Anaconda does not provide the wheel built for jetson. +Install [Archiconda](https://github.com/Archiconda/build-tools/releases) instead of Anaconda because the latter does not provide the wheel built for Jetson. +```shell +wget https://github.com/Archiconda/build-tools/releases/download/0.2.3/Archiconda3-0.2.3-Linux-aarch64.sh +bash Archiconda3-0.2.3-Linux-aarch64.sh -b -After we installed the Archiconda successfully and created the virtual env correctly. 
If the pip in the env does not work properly or throw `Illegal instruction (core dumped)`, we may consider re-install the pip manually, reinstalling the whole JetPack SDK is the last method we can try. +echo -e '\n# set environment variable for conda' >> ~/.bashrc +echo ". ~/archiconda3/etc/profile.d/conda.sh" >> ~/.bashrc +echo 'export PATH=$PATH:~/archiconda3/bin' >> ~/.bashrc -### Move tensorrt to conda env -After we installed the Archiconda, we can use it to create a virtual env like `mmdeploy`. Then we have to move the pre-installed tensorrt package in Jetpack to the virtual env. +echo -e '\n# set environment variable for pip' >> ~/.bashrc +echo 'export OPENBLAS_CORETYPE=ARMV8' >> ~/.bashrc -First we use `find` to get where the tensorrt is -``` -sudo find / -name tensorrt -``` -Then copy the tensorrt to our destination like: +source ~/.bashrc +conda --version ``` -cp -r /usr/lib/python3.6/dist-packages/tensorrt* /home/archiconda3/env/mmdeploy/lib/python3.6/site-packages/ +After the installation, create a conda environment and activate it. +```shell +# get the version of python3 installed by default +export PYTHON_VERSION=`python3 --version | cut -d' ' -f 2 | cut -d'.' -f1,2` +conda create -y -n mmdeploy python=${PYTHON_VERSION} +conda activate mmdeploy ``` -Meanwhle, tensorrt libs like `libnvinfer.so` can be found in `LD_LIBRARY_PATH`, which is done by Jetpack as well. -### Install torch +```{note} +JetPack SDK 4+ provides python 3.6. We strongly recommend using the default python. Trying to upgrade it probably ruin the JetPack environment. -Install the PyTorch for Jetsons **specifically**. Click [here](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048) to get the wheel. 
Before we use `pip install`, we have to install `libopenblas-base`, `libopenmpi-dev` first: -``` -sudo apt-get install libopenblas-base libopenmpi-dev -``` -Or, it will throw the following error when we import torch in python: -``` -libmpi_cxx.so.20: cannot open shared object file: No such file or directory +If a higher-version python is necessary, you can install JetPack 5+, in which the python version is 3.8 ``` +### PyTorch -### Install torchvision -We can't directly use `pip install torchvision` to install torchvision for Jetson Nano. But we can clone the repository from Github and build it locally. First we have to install some dependencies: -``` -sudo apt-get install libjpeg-dev libpython3-dev libavcodec-dev libavformat-dev libswscale-dev -``` -Then just clone and compile the project: -``` -git clone git@github.com:pytorch/vision.git +Download the PyTorch wheel for Jetson from [here](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048) and save it to the local directory `/opt`. +And build torchvision from source as there is no prebuilt torchvision for Jetson platforms. + +Take `torch 1.8.0` and `torchvision 0.9.0` for example. You can install them as below: +```shell +sudo apt-get install -y libopenblas-base libopenmpi-dev libjpeg-dev libpython3-dev libavcodec-dev libavformat-dev libswscale-dev +pip install /opt/torch-1.8.0-cp36-cp36m-linux_aarch64.whl + +# build torchvision +git clone https://github.com/pytorch/vision.git cd vision -git checkout tags/v0.7.0 -b vision07 +git checkout tags/v0.9.0 -b v0.9.0 pip install -e . ``` -### Install mmcv +### CMake -Install openssl first: -``` -sudo apt-get install libssl-dev +We use the latest cmake v3.23.1 released in April 2022. 
+```shell +# purge existing +sudo apt-get purge cmake +sudo snap remove cmake + +# install prebuilt binary +export CMAKE_VER=3.23.1 +export ARCH=aarch64 +wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VER}/cmake-${CMAKE_VER}-linux-${ARCH}.sh +chmod +x cmake-${CMAKE_VER}-linux-${ARCH}.sh +sudo ./cmake-${CMAKE_VER}-linux-${ARCH}.sh --prefix=/usr --skip-license +cmake --version ``` -Then install it from source like `MMCV_WITH_OPS=1 pip install -e .` -### Update cmake +## Install Dependencies -We choose cmake version 20 as an example. -``` -sudo apt-get install -y libssl-dev -wget https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0.tar.gz -tar -zxvf cmake-3.20.0.tar.gz -cd cmake-3.20.0 -./bootstrap -make -sudo make install -``` -Then we can check the cmake version through: +The Model Converter of MMDeploy on Jetson platforms depends on [MMCV](https://github.com/open-mmlab/mmcv) and the inference engine [TensorRT](https://developer.nvidia.com/tensorrt). +While MMDeploy C/C++ Inference SDK relies on [spdlog](https://github.com/gabime/spdlog), OpenCV and [ppl.cv](https://github.com/openppl-public/ppl.cv) and so on as well as TensorRT. +Thus, in the following sections, we will describe how to prepare TensorRT. +And then, we will present the way to install dependencies of Model Converter and C/C++ Inference SDK respectively. + +### Prepare TensorRT + +TensorRT is already packed into JetPack SDK. But In order to import it successfully in conda environment, +we need to copy the tensorrt package to the conda environment created before. 
+```shell +cp -r /usr/lib/python${PYTHON_VERSION}/dist-packages/tensorrt* ~/archiconda3/envs/mmdeploy/lib/python${PYTHON_VERSION}/site-packages/ +conda deactivate +conda activate mmdeploy +python -c "import tensorrt; print(tensorrt.__version__)" # Will print the version of TensorRT + +# set environment variable for building mmdeploy later on +export TENSORRT_DIR=/usr/include/aarch64-linux-gnu + +# append cuda path and libraries to PATH and LD_LIBRARY_PATH, which is also used for building mmdeploy later on +export PATH=$PATH:/usr/local/cuda/bin +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 ``` + +You can also make the above environment variables permanent by adding them to `~/.bashrc`. + +```shell +echo -e '\n# set environment variable for TensorRT' >> ~/.bashrc +echo 'export TENSORRT_DIR=/usr/include/aarch64-linux-gnu' >> ~/.bashrc + +echo -e '\n# set environment variable for CUDA' >> ~/.bashrc +echo 'export PATH=$PATH:/usr/local/cuda/bin' >> ~/.bashrc +echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64' >> ~/.bashrc + source ~/.bashrc -cmake --version +conda activate mmdeploy ``` +### Install Dependencies for Model Converter -### Install mmdeploy -Just follow the instruction [here](../build.md). If it throws `failed building wheel for numpy...ERROR: Failed to build one or more wheels` when installing `h5py`, try install `h5py` manually. -``` -sudo apt-get install pkg-config libhdf5-100 libhdf5-dev -pip install versioned-hdf5 --no-cache-dir -``` +- Install [MMCV](https://github.com/open-mmlab/mmcv) -Then install onnx manually. First, we have to install protobuf compiler: -``` -sudo apt-get install libprotobuf-dev protobuf-compiler -``` -Then install onnx through: + MMCV hasn't provided prebuilt package for Jetson platforms, so we have to build it from source. + + ```shell + sudo apt-get install -y libssl-dev + git clone https://github.com/open-mmlab/mmcv.git + cd mmcv + git checkout v1.4.0 + MMCV_WITH_OPS=1 pip install -e . 
+ ``` + +- Install onnx + + ```shell + pip install onnx + ``` + +- Install h5py + + Model Converter employs HDF5 to save the calibration data for TensorRT INT8 quantization. + + ```shell + sudo apt-get install -y pkg-config libhdf5-100 libhdf5-dev + pip install versioned-hdf5 + ``` + +### Install Dependencies for SDK + +You can skip this section if you don't need MMDeploy C/C++ Inference SDK. + +- Install [spdlog](https://github.com/gabime/spdlog) + + "`spdlog` is a very fast, header-only/compiled, C++ logging library" + + ```shell + sudo apt-get install -y libspdlog-dev + ``` + +- Install [ppl.cv](https://github.com/openppl-public/ppl.cv) + + "`ppl.cv` is a high-performance image processing library of [openPPL](https://openppl.ai/home)" + + ```shell + git clone https://github.com/openppl-public/ppl.cv.git + cd ppl.cv + export PPLCV_DIR=$(pwd) + echo -e '\n# set environment variable for ppl.cv' >> ~/.bashrc + echo "export PPLCV_DIR=$(pwd)" >> ~/.bashrc + ./build.sh cuda + ``` + +## Install MMDeploy + +```shell +git clone --recursive https://github.com/open-mmlab/mmdeploy.git +cd mmdeploy +export MMDEPLOY_DIR=$(pwd) ``` -pip install onnx + +### Install Model Converter + +Since some operators adopted by OpenMMLab codebases are not supported by TenorRT, +we build the custom TensorRT plugins to make it up, such as `roi_align`, `scatternd`, etc. +You can find a full list of custom plugins from [here](../ops/tensorrt.md). + +```shell +# build TensorRT custom operators +mkdir -p build && cd build +cmake .. -DMMDEPLOY_TARGET_BACKENDS="trt" +make -j$(nproc) + +# install model converter +cd ${MMDEPLOY_DIR} +pip install -v -e . +# "-v" means verbose, or more output +# "-e" means installing a project in editable mode, +# thus any local modifications made to the code will take effect without re-installation. ``` -Then reinstall mmdeploy. +### Install C/C++ Inference SDK + +You can skip this section if you don't need MMDeploy C/C++ Inference SDK. + +1. 
Build SDK Libraries + + ```shell + mkdir -p build && cd build + cmake .. \ + -DMMDEPLOY_BUILD_SDK=ON \ + -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \ + -DMMDEPLOY_TARGET_DEVICES="cuda;cpu" \ + -DMMDEPLOY_TARGET_BACKENDS="trt" \ + -DMMDEPLOY_CODEBASES=all \ + -Dpplcv_DIR=${PPLCV_DIR}/cuda-build/install/lib/cmake/ppl + make -j$(nproc) && make install + ``` + +2. Build SDK demos + + ```shell + cd ${MMDEPLOY_DIR}/build/install/example + mkdir -p build && cd build + cmake .. -DMMDeploy_DIR=${MMDEPLOY_DIR}/build/install/lib/cmake/MMDeploy + make -j$(nproc) + ``` + +3. Run a demo + +Take the object detection for example: + ```shell + ./object_detection cuda ${directory/to/the/converted/models} ${path/to/an/image} + ``` + +## Troubleshooting + +### Installation + +- `pip install` throws an error like `Illegal instruction (core dumped)` + + ```shell + echo '# set env for pip' >> ~/.bashrc + echo 'export OPENBLAS_CORETYPE=ARMV8' >> ~/.bashrc + source ~/.bashrc + ``` + + If steps above don't work, check if you are using any mirror, if you did, try this: + ```shell + rm .condarc + conda clean -i + conda create -n xxx python=${PYTHON_VERSION} + ``` -### FAQs +### Runtime -- For Jetson TX2 and Jetson Nano, `#assertion/root/workspace/mmdeploy/csrc/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp,98` or `pre_top_k need to be reduced for devices with arch 7.2` +- `#assertion/root/workspace/mmdeploy/csrc/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp,98` or `pre_top_k need to be reduced for devices with arch 7.2` - Set MAX N mode and `sudo nvpmodel -m 0 && sudo jetson_clocks`. - Reducing the number of [pre_top_k](https://github.com/open-mmlab/mmdeploy/blob/34879e638cc2db511e798a376b9a4b9932660fe1/configs/mmdet/_base_/base_static.py#L13) to reduce the number of proposals may resolve the problem. + 1. Set `MAX N` mode and perform `sudo nvpmodel -m 0 && sudo jetson_clocks`. + 2. 
Reduce the number of `pre_top_k` in deploy config file like [mmdet pre_top_k](https://github.com/open-mmlab/mmdeploy/blob/34879e638cc2db511e798a376b9a4b9932660fe1/configs/mmdet/_base_/base_static.py#L13) does, e.g., `1000`. + 3. Convert the model again and try SDK demo again. From 94148cbe56407596c6d81f43d1bb63d3fa6e0743 Mon Sep 17 00:00:00 2001 From: Johannes L Date: Mon, 9 May 2022 16:22:00 +0200 Subject: [PATCH 33/51] Version comments added, torch install steps added. (#449) --- .../how_to_install_mmdeploy_on_jetsons.md | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md index ca5f7e3a8d..59733bba15 100644 --- a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md +++ b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md @@ -22,7 +22,7 @@ There are two major installation methods including, You can find a very detailed installation guide from NVIDIA [official website](https://developer.nvidia.com/jetpack-sdk-50dp). -Here we choose [JetPack 4.6.1](https://developer.nvidia.com/jetpack-sdk-461) as our best practice on setup Jetson platforms. +Here we choose [JetPack 4.6.1](https://developer.nvidia.com/jetpack-sdk-461) as our best practice on setup Jetson platforms. MMDeploy has been tested on JetPack 4.6 rev3 and above and TensorRT 8.0.1.6 and above. Earlier JetPack versions has incompatibilities with TensorRT 7.x ### Conda @@ -59,18 +59,23 @@ If a higher-version python is necessary, you can install JetPack 5+, in which th Download the PyTorch wheel for Jetson from [here](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048) and save it to the local directory `/opt`. And build torchvision from source as there is no prebuilt torchvision for Jetson platforms. -Take `torch 1.8.0` and `torchvision 0.9.0` for example. 
You can install them as below: +Take `torch 1.10.0` and `torchvision 0.11.1` for example. You can install them as below: ```shell -sudo apt-get install -y libopenblas-base libopenmpi-dev libjpeg-dev libpython3-dev libavcodec-dev libavformat-dev libswscale-dev -pip install /opt/torch-1.8.0-cp36-cp36m-linux_aarch64.whl - -# build torchvision -git clone https://github.com/pytorch/vision.git -cd vision -git checkout tags/v0.9.0 -b v0.9.0 +# pytorch +wget https://nvidia.box.com/shared/static/fjtbno0vpo676a25cgvuqc1wty0fkkg6.whl -O torch-1.10.0-cp36-cp36m-linux_aarch64.whl +pip3 install torch-1.10.0-cp36-cp36m-linux_aarch64.whl +# torchvision +sudo apt-get install libjpeg-dev zlib1g-dev libpython3-dev libavcodec-dev libavformat-dev libswscale-dev -y +sudo rm -r torchvision +git clone https://github.com/pytorch/vision torchvision +cd torchvision +git checkout tags/v0.11.1 -b v0.11.1 +export BUILD_VERSION=0.11.1 pip install -e . ``` +If you install other versions of PyTorch and torchvision, make sure the versions are compatible. Refer to the compatibility chart listed [here](https://pypi.org/project/torchvision/). + ### CMake We use the latest cmake v3.23.1 released in April 2022. 
From 37868566603b363b2f77da369c279e28461de123 Mon Sep 17 00:00:00 2001 From: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Date: Mon, 9 May 2022 22:27:19 +0800 Subject: [PATCH 34/51] [Docs] Fix API documentation (#443) * [Docs] Fix API documentation * add onnx dependency in readthedocs.txt * fix dependencies --- docs/en/conf.py | 4 +++- docs/zh_cn/conf.py | 4 +++- requirements/readthedocs.txt | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/en/conf.py b/docs/en/conf.py index 1d0df3ea74..ba34088b38 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -19,7 +19,7 @@ from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../..')) version_file = '../../mmdeploy/version.py' with open(version_file, 'r') as f: @@ -57,6 +57,8 @@ 'sphinx_copybutton', ] # yapf: disable +autodoc_mock_imports = ['tensorrt'] + autosectionlabel_prefix_document = True # Add any paths that contain templates here, relative to this directory. diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 8d85c0e14f..093ae2ee9e 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -19,7 +19,7 @@ from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../..')) version_file = '../../mmdeploy/version.py' with open(version_file, 'r') as f: @@ -57,6 +57,8 @@ 'sphinx_copybutton', ] # yapf: disable +autodoc_mock_imports = ['tensorrt'] + autosectionlabel_prefix_document = True # Add any paths that contain templates here, relative to this directory. 
diff --git a/requirements/readthedocs.txt b/requirements/readthedocs.txt index aae7aa05c8..a4517b1331 100644 --- a/requirements/readthedocs.txt +++ b/requirements/readthedocs.txt @@ -1,2 +1,4 @@ +h5py mmcv +onnx>=1.8.0 torch From 0cd44a6799ec168f885b4ef5b776fb135740487d Mon Sep 17 00:00:00 2001 From: hanrui1sensetime <83800577+hanrui1sensetime@users.noreply.github.com> Date: Thu, 12 May 2022 12:00:57 +0800 Subject: [PATCH 35/51] [Fix] Fix display bugs for windows (#451) * fix issue 330 for windows * fix code * fix lint * fix all platform --- tools/deploy.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/deploy.py b/tools/deploy.py index ca835a8611..330c7c2f08 100644 --- a/tools/deploy.py +++ b/tools/deploy.py @@ -267,10 +267,17 @@ def main(): if args.test_img is None: args.test_img = args.img - import os - is_display = os.getenv('DISPLAY') + + headless = False + # check headless or not for all platforms. + import tkinter + try: + tkinter.Tk() + except Exception: + headless = True + # for headless installation. 
- if is_display is not None: + if not headless: # visualize model of the backend create_process( f'visualize {backend.value} model', From 21c2a85721212088c75235a83e3f23b68a76aec4 Mon Sep 17 00:00:00 2001 From: chaoqun Date: Mon, 16 May 2022 15:39:34 +0800 Subject: [PATCH 36/51] [Docs] Minor fixes and translation of installation tutorial for Jetson (#415) * minor fixes * add Jetson installation * updated zh_cn based on new en version --- .../how_to_install_mmdeploy_on_jetsons.md | 85 +++--- .../how_to_install_mmdeploy_on_jetsons.md | 282 ++++++++++++++++++ 2 files changed, 328 insertions(+), 39 deletions(-) create mode 100644 docs/zh_cn/tutorials/how_to_install_mmdeploy_on_jetsons.md diff --git a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md index 59733bba15..e741250365 100644 --- a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md +++ b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md @@ -1,15 +1,15 @@ # Build for Jetson -In this chapter, we introduce how to install mmdeploy on NVIDIA Jetson platforms, which we have verifed on the following models: +In this chapter, we introduce how to install MMDeploy on NVIDIA Jetson platforms, which we have verified on the following modules: - Jetson Nano - Jetson TX2 - Jetson AGX Xavier ## Prerequisites -To equip a Jetson device, JetPack SDK is a must. +To equip a Jetson device, the JetPack SDK is a must. Besides, the Model Converter of MMDeploy requires an environment with PyTorch for converting PyTorch models to ONNX models. -Regarding the toolchain, cmake and gcc has to be upgraded no less than 3.14 and 7.0 respectively. +Regarding the toolchain, CMake and GCC has to be upgraded to no less than 3.14 and 7.0 respectively. 
### JetPack SDK @@ -27,6 +27,7 @@ Here we choose [JetPack 4.6.1](https://developer.nvidia.com/jetpack-sdk-461) as ### Conda Install [Archiconda](https://github.com/Archiconda/build-tools/releases) instead of Anaconda because the latter does not provide the wheel built for Jetson. + ```shell wget https://github.com/Archiconda/build-tools/releases/download/0.2.3/Archiconda3-0.2.3-Linux-aarch64.sh bash Archiconda3-0.2.3-Linux-aarch64.sh -b @@ -41,7 +42,9 @@ echo 'export OPENBLAS_CORETYPE=ARMV8' >> ~/.bashrc source ~/.bashrc conda --version ``` + After the installation, create a conda environment and activate it. + ```shell # get the version of python3 installed by default export PYTHON_VERSION=`python3 --version | cut -d' ' -f 2 | cut -d'.' -f1,2` @@ -50,9 +53,9 @@ conda activate mmdeploy ``` ```{note} -JetPack SDK 4+ provides python 3.6. We strongly recommend using the default python. Trying to upgrade it probably ruin the JetPack environment. +JetPack SDK 4+ provides python 3.6. We strongly recommend using the default python. Trying to upgrade it probably will ruin the JetPack environment. -If a higher-version python is necessary, you can install JetPack 5+, in which the python version is 3.8 +If a higher-version python is necessary, you can install JetPack 5+, in which the python version is 3.8. ``` ### PyTorch @@ -60,6 +63,7 @@ Download the PyTorch wheel for Jetson from [here](https://forums.developer.nvidi And build torchvision from source as there is no prebuilt torchvision for Jetson platforms. Take `torch 1.10.0` and `torchvision 0.11.1` for example. You can install them as below: + ```shell # pytorch wget https://nvidia.box.com/shared/static/fjtbno0vpo676a25cgvuqc1wty0fkkg6.whl -O torch-1.10.0-cp36-cp36m-linux_aarch64.whl @@ -79,6 +83,7 @@ If you install other versions of PyTorch and torchvision, make sure the versions ### CMake We use the latest cmake v3.23.1 released in April 2022. 
+ ```shell # purge existing sudo apt-get purge cmake @@ -96,14 +101,15 @@ cmake --version ## Install Dependencies The Model Converter of MMDeploy on Jetson platforms depends on [MMCV](https://github.com/open-mmlab/mmcv) and the inference engine [TensorRT](https://developer.nvidia.com/tensorrt). -While MMDeploy C/C++ Inference SDK relies on [spdlog](https://github.com/gabime/spdlog), OpenCV and [ppl.cv](https://github.com/openppl-public/ppl.cv) and so on as well as TensorRT. +While MMDeploy C/C++ Inference SDK relies on [spdlog](https://github.com/gabime/spdlog), OpenCV and [ppl.cv](https://github.com/openppl-public/ppl.cv) and so on, as well as TensorRT. Thus, in the following sections, we will describe how to prepare TensorRT. And then, we will present the way to install dependencies of Model Converter and C/C++ Inference SDK respectively. ### Prepare TensorRT -TensorRT is already packed into JetPack SDK. But In order to import it successfully in conda environment, +TensorRT is already packed into JetPack SDK. However, in order to import it successfully in the conda environment, we need to copy the tensorrt package to the conda environment created before. + ```shell cp -r /usr/lib/python${PYTHON_VERSION}/dist-packages/tensorrt* ~/archiconda3/envs/mmdeploy/lib/python${PYTHON_VERSION}/site-packages/ conda deactivate @@ -131,6 +137,7 @@ echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64' >> ~/.bashr source ~/.bashrc conda activate mmdeploy ``` + ### Install Dependencies for Model Converter - Install [MMCV](https://github.com/open-mmlab/mmcv) @@ -145,7 +152,7 @@ conda activate mmdeploy MMCV_WITH_OPS=1 pip install -e . ``` -- Install onnx +- Install ONNX ```shell pip install onnx @@ -174,7 +181,7 @@ You can skip this section if you don't need MMDeploy C/C++ Inference SDK. 
- Install [ppl.cv](https://github.com/openppl-public/ppl.cv) - "`ppl.cv` is a high-performance image processing library of [openPPL](https://openppl.ai/home)" + "`ppl.cv` is a high-performance image processing library of [OpenPPL](https://openppl.ai/home)" ```shell git clone https://github.com/openppl-public/ppl.cv.git @@ -219,33 +226,33 @@ You can skip this section if you don't need MMDeploy C/C++ Inference SDK. 1. Build SDK Libraries - ```shell - mkdir -p build && cd build - cmake .. \ - -DMMDEPLOY_BUILD_SDK=ON \ - -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \ - -DMMDEPLOY_TARGET_DEVICES="cuda;cpu" \ - -DMMDEPLOY_TARGET_BACKENDS="trt" \ - -DMMDEPLOY_CODEBASES=all \ - -Dpplcv_DIR=${PPLCV_DIR}/cuda-build/install/lib/cmake/ppl - make -j$(nproc) && make install - ``` + ```shell + mkdir -p build && cd build + cmake .. \ + -DMMDEPLOY_BUILD_SDK=ON \ + -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \ + -DMMDEPLOY_TARGET_DEVICES="cuda;cpu" \ + -DMMDEPLOY_TARGET_BACKENDS="trt" \ + -DMMDEPLOY_CODEBASES=all \ + -Dpplcv_DIR=${PPLCV_DIR}/cuda-build/install/lib/cmake/ppl + make -j$(nproc) && make install + ``` 2. Build SDK demos - ```shell - cd ${MMDEPLOY_DIR}/build/install/example - mkdir -p build && cd build - cmake .. -DMMDeploy_DIR=${MMDEPLOY_DIR}/build/install/lib/cmake/MMDeploy - make -j$(nproc) - ``` + ```shell + cd ${MMDEPLOY_DIR}/build/install/example + mkdir -p build && cd build + cmake .. -DMMDeploy_DIR=${MMDEPLOY_DIR}/build/install/lib/cmake/MMDeploy + make -j$(nproc) + ``` 3. 
Run a demo -Take the object detection for example: - ```shell - ./object_detection cuda ${directory/to/the/converted/models} ${path/to/an/image} - ``` + Take the object detection for example: + ```shell + ./object_detection cuda ${directory/to/the/converted/models} ${path/to/an/image} + ``` ## Troubleshooting @@ -254,17 +261,17 @@ Take the object detection for example: - `pip install` throws an error like `Illegal instruction (core dumped)` ```shell - echo '# set env for pip' >> ~/.bashrc - echo 'export OPENBLAS_CORETYPE=ARMV8' >> ~/.bashrc - source ~/.bashrc - ``` + echo '# set env for pip' >> ~/.bashrc + echo 'export OPENBLAS_CORETYPE=ARMV8' >> ~/.bashrc + source ~/.bashrc + ``` - If steps above don't work, check if you are using any mirror, if you did, try this: + If the steps above don't work, check if you are using any mirror. If so, try this: ```shell - rm .condarc - conda clean -i - conda create -n xxx python=${PYTHON_VERSION} - ``` + rm .condarc + conda clean -i + conda create -n xxx python=${PYTHON_VERSION} + ``` ### Runtime diff --git a/docs/zh_cn/tutorials/how_to_install_mmdeploy_on_jetsons.md b/docs/zh_cn/tutorials/how_to_install_mmdeploy_on_jetsons.md new file mode 100644 index 0000000000..d500ec3d37 --- /dev/null +++ b/docs/zh_cn/tutorials/how_to_install_mmdeploy_on_jetsons.md @@ -0,0 +1,282 @@ +# 如何在 Jetson 模组上安装 MMDeploy + +本教程将介绍如何在 NVIDIA Jetson 平台上安装 MMDeploy。该方法已经在以下 3 种 Jetson 模组上进行了验证: +- Jetson Nano +- Jetson TX2 +- Jetson AGX Xavier + +## 预备 + +首先需要在 Jetson 模组上安装 JetPack SDK。 +此外,在利用 MMDeploy 的 Model Converter 转换 PyTorch 模型为 ONNX 模型时,需要创建一个装有 PyTorch 的环境。 +最后,关于编译工具链,要求 CMake 和 GCC 的版本分别不低于 3.14 和 7.0。 + +### JetPack SDK + +JetPack SDK 为构建硬件加速的边缘 AI 应用提供了一个全面的开发环境。 +其支持所有的 Jetson 模组及开发套件。 + +主要有两种安装 JetPack SDK 的方式: +1. 使用 SD 卡镜像方式,直接将镜像刻录到 SD 卡上 +2. 
使用 NVIDIA SDK Manager 进行安装 + +你可以在 NVIDIA [官网](https://developer.nvidia.com/jetpack-sdk-50dp)上找到详细的安装指南。 + +这里我们选择 [JetPack 4.6.1](https://developer.nvidia.com/jetpack-sdk-461) 作为装配 Jetson 模组的首选。MMDeploy 已经在 JetPack 4.6 rev3 及以上版本,TensorRT 8.0.1.6 及以上版本进行了测试。更早的 JetPack 版本与 TensorRT 7.x 存在不兼容的情况。 + +### Conda + +安装 [Archiconda](https://github.com/Archiconda/build-tools/releases) 而不是 Anaconda,因为后者不提供针对 Jetson 的 wheel 文件。 + +```shell +wget https://github.com/Archiconda/build-tools/releases/download/0.2.3/Archiconda3-0.2.3-Linux-aarch64.sh +bash Archiconda3-0.2.3-Linux-aarch64.sh -b + +echo -e '\n# set environment variable for conda' >> ~/.bashrc +echo ". ~/archiconda3/etc/profile.d/conda.sh" >> ~/.bashrc +echo 'export PATH=$PATH:~/archiconda3/bin' >> ~/.bashrc + +echo -e '\n# set environment variable for pip' >> ~/.bashrc +echo 'export OPENBLAS_CORETYPE=ARMV8' >> ~/.bashrc + +source ~/.bashrc +conda --version +``` + +完成安装后需创建并启动一个 conda 环境。 + +```shell +# 得到默认安装的 python3 版本 +export PYTHON_VERSION=`python3 --version | cut -d' ' -f 2 | cut -d'.' 
-f1,2` +conda create -y -n mmdeploy python=${PYTHON_VERSION} +conda activate mmdeploy +``` + +```{note} +JetPack SDK 4+ 自带 python 3.6。我们强烈建议使用默认的 python 版本。尝试升级 python 可能会破坏 JetPack 环境。 + +如果必须安装更高版本的 python, 可以选择安装 JetPack 5+,其提供 python 3.8。 +``` +### PyTorch + +从[这里](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048)下载 Jetson 的 PyTorch wheel 文件并保存在本地目录 `/opt` 中。 +此外,由于 torchvision 不提供针对 Jetson 平台的预编译包,因此需要从源码进行编译。 + +以 `torch 1.10.0` 和 `torchvision 0.11.1` 为例,可按以下方式进行安装: + +```shell +# pytorch +wget https://nvidia.box.com/shared/static/fjtbno0vpo676a25cgvuqc1wty0fkkg6.whl -O torch-1.10.0-cp36-cp36m-linux_aarch64.whl +pip3 install torch-1.10.0-cp36-cp36m-linux_aarch64.whl +# torchvision +sudo apt-get install libjpeg-dev zlib1g-dev libpython3-dev libavcodec-dev libavformat-dev libswscale-dev -y +sudo rm -r torchvision +git clone https://github.com/pytorch/vision torchvision +cd torchvision +git checkout tags/v0.11.1 -b v0.11.1 +export BUILD_VERSION=0.11.1 +pip install -e . 
+``` + +如果安装其他版本的 PyTorch 和 torchvision,需参考[这里](https://pypi.org/project/torchvision/)的表格以保证版本兼容性。 + +### CMake + +这里我们使用 CMake 截至2022年4月的最新版本 v3.23.1。 + +```shell +# purge existing +sudo apt-get purge cmake +sudo snap remove cmake + +# install prebuilt binary +export CMAKE_VER=3.23.1 +export ARCH=aarch64 +wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VER}/cmake-${CMAKE_VER}-linux-${ARCH}.sh +chmod +x cmake-${CMAKE_VER}-linux-${ARCH}.sh +sudo ./cmake-${CMAKE_VER}-linux-${ARCH}.sh --prefix=/usr --skip-license +cmake --version +``` + +## 安装依赖项 + +MMDeploy 中的 Model Converter 依赖于 [MMCV](https://github.com/open-mmlab/mmcv) 和推理引擎 [TensorRT](https://developer.nvidia.com/tensorrt)。 +同时, MMDeploy 的 C/C++ Inference SDK 依赖于 [spdlog](https://github.com/gabime/spdlog), OpenCV, [ppl.cv](https://github.com/openppl-public/ppl.cv) 和 TensorRT 等。 +因此,接下来我们将先介绍如何配置 TensorRT。 +之后再分别展示安装 Model Converter 和 C/C++ Inference SDK 的步骤。 + +### 配置 TensorRT + +JetPack SDK 自带 TensorRT。 +但是为了能够在 Conda 环境中成功导入,我们需要将 TensorRT 拷贝进先前创建的 Conda 环境中。 + +```shell +cp -r /usr/lib/python${PYTHON_VERSION}/dist-packages/tensorrt* ~/archiconda3/envs/mmdeploy/lib/python${PYTHON_VERSION}/site-packages/ +conda deactivate +conda activate mmdeploy +python -c "import tensorrt; print(tensorrt.__version__)" # 将会打印出 TensorRT 版本 + +# 为之后编译 MMDeploy 设置环境变量 +export TENSORRT_DIR=/usr/include/aarch64-linux-gnu + +# 将 cuda 路径和 lib 路径写入到环境变量 `$PATH` 和 `$LD_LIBRARY_PATH` 中, 为之后编译 MMDeploy 做准备 +export PATH=$PATH:/usr/local/cuda/bin +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 +``` + +你也可以通过添加以上环境变量至 `~/.bashrc` 使得它们永久化。 + +```shell +echo -e '\n# set environment variable for TensorRT' >> ~/.bashrc +echo 'export TENSORRT_DIR=/usr/include/aarch64-linux-gnu' >> ~/.bashrc + +echo -e '\n# set environment variable for CUDA' >> ~/.bashrc +echo 'export PATH=$PATH:/usr/local/cuda/bin' >> ~/.bashrc +echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64' >> ~/.bashrc + +source ~/.bashrc 
+conda activate mmdeploy +``` + +### 安装 Model Converter 的依赖项 + +- 安装 [MMCV](https://github.com/open-mmlab/mmcv) + + MMCV 还未提供针对 Jetson 平台的预编译包,因此我们需要从源对其进行编译。 + + ```shell + sudo apt-get install -y libssl-dev + git clone https://github.com/open-mmlab/mmcv.git + cd mmcv + git checkout v1.4.0 + MMCV_WITH_OPS=1 pip install -e . + ``` + +- 安装 ONNX + + ```shell + pip install onnx + ``` + +- 安装 h5py + + Model Converter 使用 HDF5 存储 TensorRT INT8 量化的校准数据。 + + ```shell + sudo apt-get install -y pkg-config libhdf5-100 libhdf5-dev + pip install versioned-hdf5 + ``` + +### 安装 SDK 的依赖项 + +如果你不需要使用 MMDeploy C/C++ Inference SDK 则可以跳过本步骤。 + +- 安装 [spdlog](https://github.com/gabime/spdlog) + + “`spdlog` 是一个快速的,仅有头文件的 C++ 日志库。” + + ```shell + sudo apt-get install -y libspdlog-dev + ``` + +- 安装 [ppl.cv](https://github.com/openppl-public/ppl.cv) + + “`ppl.cv` 是 [OpenPPL](https://openppl.ai/home) 的高性能图像处理库。” + + ```shell + git clone https://github.com/openppl-public/ppl.cv.git + cd ppl.cv + export PPLCV_DIR=$(pwd) + echo -e '\n# set environment variable for ppl.cv' >> ~/.bashrc + echo "export PPLCV_DIR=$(pwd)" >> ~/.bashrc + ./build.sh cuda + ``` + +## 安装 MMDeploy + +```shell +git clone --recursive https://github.com/open-mmlab/mmdeploy.git +cd mmdeploy +export MMDEPLOY_DIR=$(pwd) +``` + +### 安装 Model Converter + +由于一些算子采用的是 OpenMMLab 代码库中的实现,并不被 TensorRT 支持, +因此我们需要自定义 TensorRT 插件,例如 `roi_align`, `scatternd` 等。 +你可以从[这里](../ops/tensorrt.md)找到完整的自定义插件列表。 + +```shell +# 编译 TensorRT 自定义算子 +mkdir -p build && cd build +cmake .. -DMMDEPLOY_TARGET_BACKENDS="trt" +make -j$(nproc) + +# 安装 model converter +cd ${MMDEPLOY_DIR} +pip install -v -e . +# "-v" 表示显示详细安装信息 +# "-e" 表示在可编辑模式下安装 +# 因此任何针对代码的本地修改都可以在无需重装的情况下生效。 +``` + +### 安装 C/C++ Inference SDK + +如果你不需要使用 MMDeploy C/C++ Inference SDK 则可以跳过本步骤。 + +1. 编译 SDK Libraries + + ```shell + mkdir -p build && cd build + cmake ..
\ + -DMMDEPLOY_BUILD_SDK=ON \ + -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \ + -DMMDEPLOY_TARGET_DEVICES="cuda;cpu" \ + -DMMDEPLOY_TARGET_BACKENDS="trt" \ + -DMMDEPLOY_CODEBASES=all \ + -Dpplcv_DIR=${PPLCV_DIR}/cuda-build/install/lib/cmake/ppl + make -j$(nproc) && make install + ``` + +2. 编译 SDK demos + + ```shell + cd ${MMDEPLOY_DIR}/build/install/example + mkdir -p build && cd build + cmake .. -DMMDeploy_DIR=${MMDEPLOY_DIR}/build/install/lib/cmake/MMDeploy + make -j$(nproc) + ``` + +3. 运行 demo + + 以目标检测为例: + ```shell + ./object_detection cuda ${directory/to/the/converted/models} ${path/to/an/image} + ``` + +## Troubleshooting + +### 安装 + +- `pip install` 报错 `Illegal instruction (core dumped)` + + ```shell + echo '# set env for pip' >> ~/.bashrc + echo 'export OPENBLAS_CORETYPE=ARMV8' >> ~/.bashrc + source ~/.bashrc + ``` + + 如果上述方法仍无法解决问题,检查是否正在使用镜像文件。如果是的,可尝试: + ```shell + rm .condarc + conda clean -i + conda create -n xxx python=${PYTHON_VERSION} + ``` + +### 执行 + +- `#assertion/root/workspace/mmdeploy/csrc/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp,98` or `pre_top_k need to be reduced for devices with arch 7.2` + + 1. 设置为 `MAX N` 模式并执行 `sudo nvpmodel -m 0 && sudo jetson_clocks`。 + 2. 效仿 [mmdet pre_top_k](https://github.com/open-mmlab/mmdeploy/blob/34879e638cc2db511e798a376b9a4b9932660fe1/configs/mmdet/_base_/base_static.py#L13),减少配置文件中 `pre_top_k` 的个数,例如 `1000`。 + 3. 重新进行模型转换并重新运行 demo。 From 2f2ec2728e11d658f603b0998e0000f6be0389a1 Mon Sep 17 00:00:00 2001 From: Johannes L Date: Tue, 17 May 2022 13:55:47 +0200 Subject: [PATCH 37/51] =?UTF-8?q?If=20a=20cuda=20launch=20error=20occurs,?= =?UTF-8?q?=20verify=20if=20cuda=20device=20requires=20top=5Fk=20t?= =?UTF-8?q?=E2=80=A6=20(#479)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * If a cuda launch error occurs, verify if cuda device requires top_k to be reduced. 
* Fixed lint * Clang format * Fixed lint, clang-format --- .../tensorrt/batched_nms/allClassNMS.cu | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/csrc/backend_ops/tensorrt/batched_nms/allClassNMS.cu b/csrc/backend_ops/tensorrt/batched_nms/allClassNMS.cu index dab871b2a9..d16652fab9 100644 --- a/csrc/backend_ops/tensorrt/batched_nms/allClassNMS.cu +++ b/csrc/backend_ops/tensorrt/batched_nms/allClassNMS.cu @@ -212,7 +212,19 @@ pluginStatus_t allClassNMS_gpu(cudaStream_t stream, const int num, const int num (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array, (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array, flipXY); - CSC(cudaGetLastError(), STATUS_FAILURE); + cudaError_t code = cudaGetLastError(); + if (code != cudaSuccess) { + // Verify if cuda dev0 requires top_k to be reduced; + // sm_53 (Jetson Nano) and sm_62 (Jetson TX2) requires reduced top_k < 1000 + auto __cuda_arch__ = get_cuda_arch(0); + if ((__cuda_arch__ == 530 || __cuda_arch__ == 620) && top_k >= 1000) { + printf( + "Warning: pre_top_k need to be reduced for devices with arch 5.3, 6.2, got " + "pre_top_k=%d\n", + top_k); + } + return STATUS_FAILURE; + } return STATUS_SUCCESS; } @@ -250,11 +262,6 @@ pluginStatus_t allClassNMS(cudaStream_t stream, const int num, const int num_cla const bool isNormalized, const DataType DT_SCORE, const DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores, void *beforeNMS_index_array, void *afterNMS_scores, void *afterNMS_index_array, bool flipXY) { - auto __cuda_arch__ = get_cuda_arch(0); // assume there is only one arch 7.2 device - if (__cuda_arch__ == 720 && top_k >= 1000) { - printf("Warning: pre_top_k need to be reduced for devices with arch 7.2, got pre_top_k=%d\n", - top_k); - } nmsLaunchConfigSSD lc = nmsLaunchConfigSSD(DT_SCORE, DT_BBOX, allClassNMS_gpu); for (unsigned i = 0; i < nmsFuncVec.size(); ++i) { if (lc == nmsFuncVec[i]) { From ba641c3b23e177eaf10d94ae9d811bbd4af5ac52 Mon Sep 
17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Tue, 17 May 2022 19:57:12 +0800 Subject: [PATCH 38/51] [Fix] set optional arg a default value (#483) * optional default value * resolve comments Co-authored-by: dongchunyu.vendor --- tools/deploy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/deploy.py b/tools/deploy.py index 330c7c2f08..e348aa7090 100644 --- a/tools/deploy.py +++ b/tools/deploy.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import logging +import os import os.path as osp from functools import partial @@ -26,7 +27,10 @@ def parse_args(): parser.add_argument('img', help='image used to convert model model') parser.add_argument( '--test-img', default=None, help='image used to test model') - parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--work-dir', + default=os.getcwd(), + help='the dir to save logs and models') parser.add_argument( '--calib-dataset-cfg', help='dataset config path used to calibrate in int8 mode. If not \ From 69111a6b956fd147d1590f0b808e545cebf45eeb Mon Sep 17 00:00:00 2001 From: Lakshantha Dissanayake Date: Tue, 17 May 2022 18:10:44 +0530 Subject: [PATCH 39/51] Update: Optimize document (#484) * Update: Optimize document - Minor fixes in styling and grammar - Add support for Jetson Xavier NX (Tested and worked) - Add hardware recommendation - Change JetPack installation guide URL from jp5.0 to jp4.6.1 - Add a note to select "Jetson SDK Components" when using NVIDIA SDK Manager - Change PyTorch wheel save location - Add more dependencies needed for torchvision installation. 
Otherwise the installation fails - Simplify torchvision git cloning branch - Add installation times for torchvision, MMCV, versioned-hdf5, ppl.cv, model converter, SDK libraries - Delete "snap" from cmake removal as "apt-get purge" is enough - Add a note on which scenarios you need to append cuda path and libraries to PATH and LD_LIBRARY_PATH - Simplify MMCV git cloning branch - Delete "skip if you don't need MMDeploy C/C++ Inference SDK", because that is the only available inference SDK at the moment - Add more details to object detection demo using C/C++ Inference SDK such as installing MMDetection and converting a model - Add image of inference result - Delete "set env for pip" in troubleshooting because this is already mentioned under "installing Archiconda" Signed-off-by: Lakshantha Dissanayake * Fix: note style on doc * Fix: Trim trailing whitespaces * Update: add source image before inference --- .../how_to_install_mmdeploy_on_jetsons.md | 237 +++++++++++------- 1 file changed, 150 insertions(+), 87 deletions(-) diff --git a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md index e741250365..b36a81ed18 100644 --- a/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md +++ b/docs/en/tutorials/how_to_install_mmdeploy_on_jetsons.md @@ -1,15 +1,22 @@ # Build for Jetson In this chapter, we introduce how to install MMDeploy on NVIDIA Jetson platforms, which we have verified on the following modules: + - Jetson Nano +- Jetson Xavier NX - Jetson TX2 - Jetson AGX Xavier +Hardware recommendation: + +- [Seeed reComputer built with Jetson Nano module](https://www.seeedstudio.com/Jetson-10-1-A0-p-5336.html) +- [Seeed reComputer built with Jetson Xavier NX module](https://www.seeedstudio.com/Jetson-20-1-H1-p-5328.html) + ## Prerequisites -To equip a Jetson device, the JetPack SDK is a must.
-Besides, the Model Converter of MMDeploy requires an environment with PyTorch for converting PyTorch models to ONNX models. -Regarding the toolchain, CMake and GCC has to be upgraded to no less than 3.14 and 7.0 respectively. +- To equip a Jetson device, JetPack SDK is a must. +- The Model Converter of MMDeploy requires an environment with PyTorch for converting PyTorch models to ONNX models. +- Regarding the toolchain, CMake and GCC has to be upgraded to no less than 3.14 and 7.0 respectively. ### JetPack SDK @@ -20,9 +27,13 @@ There are two major installation methods including, 1. SD Card Image Method 2. NVIDIA SDK Manager Method -You can find a very detailed installation guide from NVIDIA [official website](https://developer.nvidia.com/jetpack-sdk-50dp). +You can find a very detailed installation guide from NVIDIA [official website](https://developer.nvidia.com/jetpack-sdk-461). + +```{note} +Please select the option to install "Jetson SDK Components" when using NVIDIA SDK Manager as this includes CUDA and TensorRT which are needed for this guide. +``` -Here we choose [JetPack 4.6.1](https://developer.nvidia.com/jetpack-sdk-461) as our best practice on setup Jetson platforms. MMDeploy has been tested on JetPack 4.6 rev3 and above and TensorRT 8.0.1.6 and above. Earlier JetPack versions has incompatibilities with TensorRT 7.x +Here we have chosen [JetPack 4.6.1](https://developer.nvidia.com/jetpack-sdk-461) as our best practice on setting up Jetson platforms. MMDeploy has been tested on JetPack 4.6 (rev.3) and above and TensorRT 8.0.1.6 and above. Earlier JetPack versions has incompatibilities with TensorRT 7.x ### Conda @@ -53,14 +64,14 @@ conda activate mmdeploy ``` ```{note} -JetPack SDK 4+ provides python 3.6. We strongly recommend using the default python. Trying to upgrade it probably will ruin the JetPack environment. +JetPack SDK 4+ provides Python 3.6. We strongly recommend using the default Python. 
Trying to upgrade it will probably ruin the JetPack environment. -If a higher-version python is necessary, you can install JetPack 5+, in which the python version is 3.8. +If a higher-version of Python is necessary, you can install JetPack 5+, in which the Python version is 3.8. ``` + ### PyTorch -Download the PyTorch wheel for Jetson from [here](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-10-now-available/72048) and save it to the local directory `/opt`. -And build torchvision from source as there is no prebuilt torchvision for Jetson platforms. +Download the PyTorch wheel for Jetson from [here](https://forums.developer.nvidia.com/t/pytorch-for-jetson-version-1-11-now-available/72048) and save it to the `/home/username` directory. Build torchvision from source as there is no prebuilt torchvision for Jetson platforms. Take `torch 1.10.0` and `torchvision 0.11.1` for example. You can install them as below: @@ -68,16 +79,19 @@ Take `torch 1.10.0` and `torchvision 0.11.1` for example. You can install them # pytorch wget https://nvidia.box.com/shared/static/fjtbno0vpo676a25cgvuqc1wty0fkkg6.whl -O torch-1.10.0-cp36-cp36m-linux_aarch64.whl pip3 install torch-1.10.0-cp36-cp36m-linux_aarch64.whl + # torchvision -sudo apt-get install libjpeg-dev zlib1g-dev libpython3-dev libavcodec-dev libavformat-dev libswscale-dev -y -sudo rm -r torchvision -git clone https://github.com/pytorch/vision torchvision +sudo apt-get install libjpeg-dev zlib1g-dev libpython3-dev libavcodec-dev libavformat-dev libswscale-dev libopenblas-base libopenmpi-dev -y +git clone --branch v0.11.1 https://github.com/pytorch/vision torchvision cd torchvision -git checkout tags/v0.11.1 -b v0.11.1 export BUILD_VERSION=0.11.1 pip install -e . ``` +```{note} +It takes about 30 minutes to install torchvision on a Jetson Nano. So, please be patient until the installation is complete. +``` + If you install other versions of PyTorch and torchvision, make sure the versions are compatible. 
Refer to the compatibility chart listed [here](https://pypi.org/project/torchvision/). ### CMake @@ -86,8 +100,7 @@ We use the latest cmake v3.23.1 released in April 2022. ```shell # purge existing -sudo apt-get purge cmake -sudo snap remove cmake +sudo apt-get purge cmake -y # install prebuilt binary export CMAKE_VER=3.23.1 @@ -101,14 +114,13 @@ cmake --version ## Install Dependencies The Model Converter of MMDeploy on Jetson platforms depends on [MMCV](https://github.com/open-mmlab/mmcv) and the inference engine [TensorRT](https://developer.nvidia.com/tensorrt). -While MMDeploy C/C++ Inference SDK relies on [spdlog](https://github.com/gabime/spdlog), OpenCV and [ppl.cv](https://github.com/openppl-public/ppl.cv) and so on, as well as TensorRT. +While MMDeploy C/C++ Inference SDK relies on [spdlog](https://github.com/gabime/spdlog), OpenCV and [ppl.cv](https://github.com/openppl-public/ppl.cv) and so on, as well as TensorRT. Thus, in the following sections, we will describe how to prepare TensorRT. And then, we will present the way to install dependencies of Model Converter and C/C++ Inference SDK respectively. ### Prepare TensorRT -TensorRT is already packed into JetPack SDK. However, in order to import it successfully in the conda environment, -we need to copy the tensorrt package to the conda environment created before. +TensorRT is already packed into JetPack SDK. However, in order to import it successfully in the conda environment, we need to copy the tensorrt package to the conda environment created before.
```shell cp -r /usr/lib/python${PYTHON_VERSION}/dist-packages/tensorrt* ~/archiconda3/envs/mmdeploy/lib/python${PYTHON_VERSION}/site-packages/ @@ -119,7 +131,9 @@ python -c "import tensorrt; print(tensorrt.__version__)" # Will print the versio # set environment variable for building mmdeploy later on export TENSORRT_DIR=/usr/include/aarch64-linux-gnu -# append cuda path and libraries to PATH and LD_LIBRARY_PATH, which is also used for building mmdeploy later on +# append cuda path and libraries to PATH and LD_LIBRARY_PATH, which is also used for building mmdeploy later on. +# this is not needed if you use NVIDIA SDK Manager with "Jetson SDK Components" for installing JetPack. +# this is only needed if you install JetPack using SD Card Image Method. export PATH=$PATH:/usr/local/cuda/bin export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 ``` @@ -130,6 +144,8 @@ You can also make the above environment variables permanent by adding them to `~ echo -e '\n# set environment variable for TensorRT' >> ~/.bashrc echo 'export TENSORRT_DIR=/usr/include/aarch64-linux-gnu' >> ~/.bashrc +# this is not needed if you use NVIDIA SDK Manager with "Jetson SDK Components" for installing JetPack. +# this is only needed if you install JetPack using SD Card Image Method. echo -e '\n# set environment variable for CUDA' >> ~/.bashrc echo 'export PATH=$PATH:/usr/local/cuda/bin' >> ~/.bashrc echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64' >> ~/.bashrc @@ -140,57 +156,66 @@ conda activate mmdeploy ### Install Dependencies for Model Converter -- Install [MMCV](https://github.com/open-mmlab/mmcv) +#### Install MMCV - MMCV hasn't provided prebuilt package for Jetson platforms, so we have to build it from source. +[MMCV](https://github.com/open-mmlab/mmcv) has not provided prebuilt package for Jetson platforms, so we have to build it from source. 
- ```shell - sudo apt-get install -y libssl-dev - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - git checkout v1.4.0 - MMCV_WITH_OPS=1 pip install -e . - ``` +```shell +sudo apt-get install -y libssl-dev +git clone --branch v1.4.0 https://github.com/open-mmlab/mmcv.git +cd mmcv +MMCV_WITH_OPS=1 pip install -e . +``` -- Install ONNX +```{note} +It takes about 1 hour 40 minutes to install MMCV on a Jetson Nano. So, please be patient until the installation is complete. +``` - ```shell - pip install onnx - ``` +#### Install ONNX -- Install h5py +```shell +pip install onnx +``` - Model Converter employs HDF5 to save the calibration data for TensorRT INT8 quantization. +#### Install h5py - ```shell - sudo apt-get install -y pkg-config libhdf5-100 libhdf5-dev - pip install versioned-hdf5 - ``` +Model Converter employs HDF5 to save the calibration data for TensorRT INT8 quantization. -### Install Dependencies for SDK +```shell +sudo apt-get install -y pkg-config libhdf5-100 libhdf5-dev +pip install versioned-hdf5 +``` -You can skip this section if you don't need MMDeploy C/C++ Inference SDK. +```{note} +It takes about 6 minutes to install versioned-hdf5 on a Jetson Nano. So, please be patient until the installation is complete. 
+``` -- Install [spdlog](https://github.com/gabime/spdlog) +### Install Dependencies for C/C++ Inference SDK - "`spdlog` is a very fast, header-only/compiled, C++ logging library" +#### Install spdlog - ```shell - sudo apt-get install -y libspdlog-dev - ``` +[spdlog](https://github.com/gabime/spdlog) is a very fast, header-only/compiled, C++ logging library -- Install [ppl.cv](https://github.com/openppl-public/ppl.cv) +```shell +sudo apt-get install -y libspdlog-dev +``` - "`ppl.cv` is a high-performance image processing library of [OpenPPL](https://openppl.ai/home)" +#### Install ppl.cv - ```shell - git clone https://github.com/openppl-public/ppl.cv.git - cd ppl.cv - export PPLCV_DIR=$(pwd) - echo -e '\n# set environment variable for ppl.cv' >> ~/.bashrc - echo "export PPLCV_DIR=$(pwd)" >> ~/.bashrc - ./build.sh cuda - ``` +[ppl.cv](https://github.com/openppl-public/ppl.cv) is a high-performance image processing library of [openPPL](https://openppl.ai/home) + +```shell +git clone https://github.com/openppl-public/ppl.cv.git +cd ppl.cv +export PPLCV_DIR=$(pwd) +echo -e '\n# set environment variable for ppl.cv' >> ~/.bashrc +echo "export PPLCV_DIR=$(pwd)" >> ~/.bashrc +./build.sh cuda +``` + +```{note} +It takes about 15 minutes to install ppl.cv on a Jetson Nano. So, please be patient until the installation is complete. +``` ## Install MMDeploy @@ -202,8 +227,7 @@ export MMDEPLOY_DIR=$(pwd) ### Install Model Converter -Since some operators adopted by OpenMMLab codebases are not supported by TenorRT, -we build the custom TensorRT plugins to make it up, such as `roi_align`, `scatternd`, etc. +Since some operators adopted by OpenMMLab codebases are not supported by TensorRT, we build the custom TensorRT plugins to make it up, such as `roi_align`, `scatternd`, etc. You can find a full list of custom plugins from [here](../ops/tensorrt.md). ```shell @@ -220,39 +244,83 @@ pip install -v -e . 
# thus any local modifications made to the code will take effect without re-installation. ``` -### Install C/C++ Inference SDK +```{note} +It takes about 5 minutes to install model converter on a Jetson Nano. So, please be patient until the installation is complete. +``` -You can skip this section if you don't need MMDeploy C/C++ Inference SDK. +### Install C/C++ Inference SDK 1. Build SDK Libraries - ```shell - mkdir -p build && cd build - cmake .. \ - -DMMDEPLOY_BUILD_SDK=ON \ - -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \ - -DMMDEPLOY_TARGET_DEVICES="cuda;cpu" \ - -DMMDEPLOY_TARGET_BACKENDS="trt" \ - -DMMDEPLOY_CODEBASES=all \ - -Dpplcv_DIR=${PPLCV_DIR}/cuda-build/install/lib/cmake/ppl - make -j$(nproc) && make install - ``` +```shell +mkdir -p build && cd build +cmake .. \ + -DMMDEPLOY_BUILD_SDK=ON \ + -DMMDEPLOY_BUILD_SDK_PYTHON_API=ON \ + -DMMDEPLOY_TARGET_DEVICES="cuda;cpu" \ + -DMMDEPLOY_TARGET_BACKENDS="trt" \ + -DMMDEPLOY_CODEBASES=all \ + -Dpplcv_DIR=${PPLCV_DIR}/cuda-build/install/lib/cmake/ppl +make -j$(nproc) && make install +``` + +```{note} +It takes about 9 minutes to build SDK libraries on a Jetson Nano. So, please be patient until the installation is complete. +``` 2. Build SDK demos - ```shell - cd ${MMDEPLOY_DIR}/build/install/example - mkdir -p build && cd build - cmake .. -DMMDeploy_DIR=${MMDEPLOY_DIR}/build/install/lib/cmake/MMDeploy - make -j$(nproc) - ``` +```shell +cd ${MMDEPLOY_DIR}/build/install/example +mkdir -p build && cd build +cmake .. -DMMDeploy_DIR=${MMDEPLOY_DIR}/build/install/lib/cmake/MMDeploy +make -j$(nproc) +``` + +### Run a Demo + +#### Object Detection demo -3. Run a demo +Before running this demo, you need to convert model files to be able to use with this SDK. - Take the object detection for example: - ```shell - ./object_detection cuda ${directory/to/the/converted/models} ${path/to/an/image} - ``` +1.
Install [MMDetection](https://github.com/open-mmlab/mmdetection) which is needed for model conversion + +MMDetection is an open source object detection toolbox based on PyTorch + +```shell +git clone https://github.com/open-mmlab/mmdetection.git +cd mmdetection +pip install -r requirements/build.txt +pip install -v -e . # or "python setup.py develop" +``` + +2. Follow [this document](https://github.com/open-mmlab/mmdeploy/blob/master/docs/en/tutorials/how_to_convert_model.md) on how to convert model files. + +For this example, we have used [retinanet_r18_fpn_1x_coco.py](https://github.com/open-mmlab/mmdetection/blob/master/configs/retinanet/retinanet_r18_fpn_1x_coco.py) as the model config, and [this file](https://download.openmmlab.com/mmdetection/v2.0/retinanet/retinanet_r18_fpn_1x_coco/retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth) as the corresponding checkpoint file. Also for deploy config, we have used [detection_tensorrt_dynamic-320x320-1344x1344.py](https://github.com/open-mmlab/mmdeploy/blob/master/configs/mmdet/detection/detection_tensorrt_dynamic-320x320-1344x1344.py) + +```shell +python ./tools/deploy.py \ + configs/mmdet/detection/detection_tensorrt_dynamic-320x320-1344x1344.py \ + $PATH_TO_MMDET/configs/retinanet/retinanet_r18_fpn_1x_coco.py \ + retinanet_r18_fpn_1x_coco_20220407_171055-614fd399.pth \ + $PATH_TO_MMDET/demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cuda:0 \ + --dump-info +``` + +3. Finally run inference on an image + +
+ +```shell +./object_detection cuda ${directory/to/the/converted/models} ${path/to/an/image} +``` + +
+ +The above inference is done on a [Seeed reComputer built with Jetson Nano module](https://www.seeedstudio.com/Jetson-10-1-A0-p-5336.html) ## Troubleshooting @@ -260,13 +328,8 @@ You can skip this section if you don't need MMDeploy C/C++ Inference SDK. - `pip install` throws an error like `Illegal instruction (core dumped)` - ```shell - echo '# set env for pip' >> ~/.bashrc - echo 'export OPENBLAS_CORETYPE=ARMV8' >> ~/.bashrc - source ~/.bashrc - ``` + Check if you are using any mirror, if you did, try this: - If the steps above don't work, check if you are using any mirror. If so, try this: ```shell rm .condarc conda clean -i From e057b87fd151aeae28123a05f1f928aa13b04f5d Mon Sep 17 00:00:00 2001 From: Yifan Gu Date: Fri, 20 May 2022 02:48:09 -0400 Subject: [PATCH 40/51] fix: bbox_nms not onnxizing if batch size > 1 (#501) A typo prevents nms from onnxizing correctly if batch size is static and greater than 1. --- mmdeploy/codebase/mmdet/core/post_processing/bbox_nms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmdeploy/codebase/mmdet/core/post_processing/bbox_nms.py b/mmdeploy/codebase/mmdet/core/post_processing/bbox_nms.py index ee7a1403d7..446a6ec7f2 100644 --- a/mmdeploy/codebase/mmdet/core/post_processing/bbox_nms.py +++ b/mmdeploy/codebase/mmdet/core/post_processing/bbox_nms.py @@ -202,7 +202,7 @@ def multiclass_nms__default(ctx, """ deploy_cfg = ctx.cfg batch_size = boxes.size(0) - if not is_dynamic_batch(deploy_cfg) and batch_size != 1: + if not is_dynamic_batch(deploy_cfg) and batch_size == 1: return _multiclass_nms_single( boxes, scores, From a4de9f370493b83226de60b2c64e4b988b435ca8 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Fri, 20 May 2022 15:20:44 +0800 Subject: [PATCH 41/51] change seperator of function marker (#499) --- mmdeploy/core/optimizers/function_marker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmdeploy/core/optimizers/function_marker.py b/mmdeploy/core/optimizers/function_marker.py 
index 5ad0501593..57ab7ff19c 100644 --- a/mmdeploy/core/optimizers/function_marker.py +++ b/mmdeploy/core/optimizers/function_marker.py @@ -154,7 +154,7 @@ def impl(ys, prefix, level): if ys not in visit: visit.add(ys) root = ctx.names[ctx.index] - name = '/'.join(str(x) for x in (root, *prefix)) + name = '.'.join(str(x) for x in (root, *prefix)) ys_shape = tuple(int(s) for s in ys.shape) ret = Mark.apply(ys, ys.dtype, ys_shape, func, func_id, io_type, name, index, attrs) From 57baf217f12d6260c61a6c49f6ab11d3cfe298e7 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Mon, 23 May 2022 14:29:05 +0800 Subject: [PATCH 42/51] [docs] Fix typo in tutorial (#509) --- docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md b/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md index e7f8d2ed99..474fbf0367 100644 --- a/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md +++ b/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md @@ -279,7 +279,7 @@ def _interpolate_helper(name, dim, interpolate_mode): ## 总结 在这篇教程中,我们系统地介绍了 PyTorch 转 ONNX 的原理。我们先是着重讲解了使用最频繁的 `torch.onnx.export`函数,又给出了查询 PyTorch 对 ONNX 算子支持情况的方法。通过本文,我们希望大家能够成功转换出大部分不需要添加新算子的 ONNX 模型,并在碰到算子问题时能够有效定位问题原因。具体而言,大家读完本文后应该了解以下的知识: - 跟踪法和脚本化在导出带控制语句的计算图时有什么区别。 -- `torch.onnx.export()`中该如何设置 i`nput_names, output_names, dynamic_axes`。 +- `torch.onnx.export()`中该如何设置 `input_names, output_names, dynamic_axes`。 - 使用 `torch.onnx.is_in_onnx_export()`来使模型在转换到 ONNX 时有不同的行为。 - 如何查询 [ONNX 算子文档](https://github.com/onnx/onnx/blob/main/docs/Operators.md)。 - 如何查询 PyTorch 对某个 ONNX 版本的新特性支持情况。 From de3f18fbb28ebec67d2382085c52d766056c1657 Mon Sep 17 00:00:00 2001 From: tripleMu <92794867+triple-Mu@users.noreply.github.com> Date: Mon, 23 May 2022 15:08:18 +0800 Subject: [PATCH 43/51] Fix docstring format (#495) * Fix doc common * Fix bugs --- mmdeploy/apis/calibration.py | 14 +++++++------- mmdeploy/apis/inference.py | 12 ++++++------ 
mmdeploy/apis/pytorch2onnx.py | 12 ++++++------ mmdeploy/apis/visualize.py | 8 ++++---- mmdeploy/backend/tensorrt/onnx2tensorrt.py | 6 +++--- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/mmdeploy/apis/calibration.py b/mmdeploy/apis/calibration.py index 1939d502fa..57000bef7e 100644 --- a/mmdeploy/apis/calibration.py +++ b/mmdeploy/apis/calibration.py @@ -24,15 +24,15 @@ def create_calib_table(calib_file: str, Examples: >>> from mmdeploy.apis import create_calib_table >>> from mmdeploy.utils import get_calib_filename, load_config - >>> deploy_cfg = 'configs/mmdet/detection/' \ - 'detection_tensorrt-int8_dynamic-320x320-1344x1344.py' + >>> deploy_cfg = ('configs/mmdet/detection/' + 'detection_tensorrt-int8_dynamic-320x320-1344x1344.py') >>> deploy_cfg = load_config(deploy_cfg)[0] >>> calib_file = get_calib_filename(deploy_cfg) - >>> model_cfg = 'mmdetection/configs/fcos/' \ - 'fcos_r50_caffe_fpn_gn-head_1x_coco.py' - >>> model_checkpoint = 'checkpoints/' \ - 'fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth' - >>> create_calib_table(calib_file, deploy_cfg, \ + >>> model_cfg = ('mmdetection/configs/fcos/' + 'fcos_r50_caffe_fpn_gn-head_1x_coco.py') + >>> model_checkpoint = ('checkpoints/' + 'fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth') + >>> create_calib_table(calib_file, deploy_cfg, model_cfg, model_checkpoint, device='cuda:0') Args: diff --git a/mmdeploy/apis/inference.py b/mmdeploy/apis/inference.py index 47bd204322..f3babb3638 100644 --- a/mmdeploy/apis/inference.py +++ b/mmdeploy/apis/inference.py @@ -16,15 +16,15 @@ def inference_model(model_cfg: Union[str, mmcv.Config], Examples: >>> from mmdeploy.apis import inference_model - >>> model_cfg = 'mmdetection/configs/fcos/' \ - 'fcos_r50_caffe_fpn_gn-head_1x_coco.py' - >>> deploy_cfg = 'configs/mmdet/detection/' \ - 'detection_onnxruntime_dynamic.py' + >>> model_cfg = ('mmdetection/configs/fcos/' + 'fcos_r50_caffe_fpn_gn-head_1x_coco.py') + >>> deploy_cfg = ('configs/mmdet/detection/' + 
'detection_onnxruntime_dynamic.py') >>> backend_files = ['work_dir/fcos.onnx'] >>> img = 'demo.jpg' >>> device = 'cpu' - >>> model_output = inference_model(model_cfg, deploy_cfg, \ - backend_files, img, device) + >>> model_output = inference_model(model_cfg, deploy_cfg, + backend_files, img, device) Args: model_cfg (str | mmcv.Config): Model config file or Config object. diff --git a/mmdeploy/apis/pytorch2onnx.py b/mmdeploy/apis/pytorch2onnx.py index e9912bc89b..b72f2b5f32 100644 --- a/mmdeploy/apis/pytorch2onnx.py +++ b/mmdeploy/apis/pytorch2onnx.py @@ -69,12 +69,12 @@ def torch2onnx(img: Any, >>> img = 'demo.jpg' >>> work_dir = 'work_dir' >>> save_file = 'fcos.onnx' - >>> deploy_cfg = 'configs/mmdet/detection/' \ - 'detection_onnxruntime_dynamic.py' - >>> model_cfg = 'mmdetection/configs/fcos/' \ - 'fcos_r50_caffe_fpn_gn-head_1x_coco.py' - >>> model_checkpoint = 'checkpoints/' \ - 'fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth' + >>> deploy_cfg = ('configs/mmdet/detection/' + 'detection_onnxruntime_dynamic.py') + >>> model_cfg = ('mmdetection/configs/fcos/' + 'fcos_r50_caffe_fpn_gn-head_1x_coco.py') + >>> model_checkpoint = ('checkpoints/' + 'fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth') >>> device = 'cpu' >>> torch2onnx(img, work_dir, save_file, deploy_cfg, \ model_cfg, model_checkpoint, device) diff --git a/mmdeploy/apis/visualize.py b/mmdeploy/apis/visualize.py index ade0a21fe8..251880ed3e 100644 --- a/mmdeploy/apis/visualize.py +++ b/mmdeploy/apis/visualize.py @@ -21,10 +21,10 @@ def visualize_model(model_cfg: Union[str, mmcv.Config], Examples: >>> from mmdeploy.apis import visualize_model - >>> model_cfg = 'mmdetection/configs/fcos/' \ - 'fcos_r50_caffe_fpn_gn-head_1x_coco.py' - >>> deploy_cfg = 'configs/mmdet/detection/' \ - 'detection_onnxruntime_dynamic.py' + >>> model_cfg = ('mmdetection/configs/fcos/' + 'fcos_r50_caffe_fpn_gn-head_1x_coco.py') + >>> deploy_cfg = ('configs/mmdet/detection/' + 'detection_onnxruntime_dynamic.py') >>> model = 
'work_dir/fcos.onnx' >>> img = 'demo.jpg' >>> device = 'cpu' diff --git a/mmdeploy/backend/tensorrt/onnx2tensorrt.py b/mmdeploy/backend/tensorrt/onnx2tensorrt.py index f0e316e468..d50359e2b3 100644 --- a/mmdeploy/backend/tensorrt/onnx2tensorrt.py +++ b/mmdeploy/backend/tensorrt/onnx2tensorrt.py @@ -26,10 +26,10 @@ def onnx2tensorrt(work_dir: str, >>> work_dir = 'work_dir' >>> save_file = 'end2end.engine' >>> model_id = 0 - >>> deploy_cfg = 'configs/mmdet/detection/' \ - 'detection_tensorrt_dynamic-320x320-1344x1344.py' + >>> deploy_cfg = ('configs/mmdet/detection/' + 'detection_tensorrt_dynamic-320x320-1344x1344.py') >>> onnx_model = 'work_dir/end2end.onnx' - >>> onnx2tensorrt(work_dir, save_file, model_id, deploy_cfg, \ + >>> onnx2tensorrt(work_dir, save_file, model_id, deploy_cfg, onnx_model, 'cuda:0') Args: From d16720b1271ce899a72da3b74ec820ee7f8973ff Mon Sep 17 00:00:00 2001 From: Yifan Zhou Date: Tue, 24 May 2022 11:33:01 +0800 Subject: [PATCH 44/51] Tutorial 04: onnx custom op (#508) * Add tutorial04 * lint * add image * resolve comment --- docs/zh_cn/tutorials/chapter_02_challenges.md | 2 +- .../tutorials/chapter_03_pytorch2onnx.md | 2 +- .../tutorials/chapter_04_onnx_custom_op.md | 464 ++++++++++++++++++ 3 files changed, 466 insertions(+), 2 deletions(-) create mode 100644 docs/zh_cn/tutorials/chapter_04_onnx_custom_op.md diff --git a/docs/zh_cn/tutorials/chapter_02_challenges.md b/docs/zh_cn/tutorials/chapter_02_challenges.md index 5e45bd7d91..705e4b6fd0 100644 --- a/docs/zh_cn/tutorials/chapter_02_challenges.md +++ b/docs/zh_cn/tutorials/chapter_02_challenges.md @@ -351,4 +351,4 @@ cv2.imwrite("face_ort_3.png", ort_output) - 通过修改继承自 torch.autograd.Function 的算子的 symbolic 方法,可以改变该算子映射到 ONNX 算子的行为。 -至此,"部署第一个模型“的教程算是告一段落了。是不是觉得学到的知识还不够多?没关系,在接下来的几篇教程中,我们将结合 MMDeploy ,重点介绍 ONNX 中间表示和 ONNX Runtime/TensorRT 推理引擎的知识,让大家学会如何部署更复杂的模型。敬请期待! 
+至此,"部署第一个模型“的教程算是告一段落了。是不是觉得学到的知识还不够多?没关系,在接下来的几篇教程中,我们将结合 MMDeploy ,重点介绍 ONNX 中间表示和 ONNX Runtime/TensorRT 推理引擎的知识,让大家学会如何部署更复杂的模型。 diff --git a/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md b/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md index 474fbf0367..4a549f6072 100644 --- a/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md +++ b/docs/zh_cn/tutorials/chapter_03_pytorch2onnx.md @@ -285,7 +285,7 @@ def _interpolate_helper(name, dim, interpolate_mode): - 如何查询 PyTorch 对某个 ONNX 版本的新特性支持情况。 - 如何判断 PyTorch 对某个 ONNX 算子是否支持,支持的方法是怎样的。 -这期介绍的知识比较抽象,大家会不会觉得有点“水”?没关系,下一期教程中,我们将以给出代码实例的形式,介绍多种为 PyTorch 转 ONNX 添加算子支持的方法,为大家在 PyTorch 转 ONNX 这条路上扫除更多的障碍。敬请期待哦! +这期介绍的知识比较抽象,大家会不会觉得有点“水”?没关系,下一篇教程中,我们将以给出代码实例的形式,介绍多种为 PyTorch 转 ONNX 添加算子支持的方法,为大家在 PyTorch 转 ONNX 这条路上扫除更多的障碍。 ## 练习 1. Asinh 算子出现于第 9 个 ONNX 算子集。PyTorch 在 9 号版本的符号表文件中是怎样支持这个算子的? 2. BitShift 算子出现于第11个 ONNX 算子集。PyTorch 在 11 号版本的符号表文件中是怎样支持这个算子的? diff --git a/docs/zh_cn/tutorials/chapter_04_onnx_custom_op.md b/docs/zh_cn/tutorials/chapter_04_onnx_custom_op.md new file mode 100644 index 0000000000..c7e348cd78 --- /dev/null +++ b/docs/zh_cn/tutorials/chapter_04_onnx_custom_op.md @@ -0,0 +1,464 @@ +# 模型部署入门教程(四):在 PyTorch 中支持更多 ONNX 算子 + +在[上一篇教程](./chapter_03_pytorch2onnx.md)中,我们系统地学习了 PyTorch 转 ONNX 的方法,可以发现 PyTorch 对 ONNX 的支持还不错。但在实际的部署过程中,难免碰到模型无法用原生 PyTorch 算子表示的情况。这个时候,我们就得考虑扩充 PyTorch,即在 PyTorch 中支持更多 ONNX 算子。 + +而要使 PyTorch 算子顺利转换到 ONNX ,我们需要保证以下三个环节都不出错: + +* 算子在 PyTorch 中有实现 +* 有把该 PyTorch 算子映射成一个或多个 ONNX 算子的方法 +* ONNX 有相应的算子 + +可在实际部署中,这三部分的内容都可能有所缺失。其中最坏的情况是:我们定义了一个全新的算子,它不仅缺少 PyTorch 实现,还缺少 PyTorch 到 ONNX 的映射关系。但所谓车到山前必有路,对于这三个环节,我们也分别都有以下的添加支持的方法: + +* PyTorch 算子 + * 组合现有算子 + * 添加 TorchScript 算子 + * 添加普通 C++ 拓展算子 +* 映射方法 + * 为 ATen 算子添加符号函数 + * 为 TorchScript 算子添加符号函数 + * 封装成 torch.autograd.Function 并添加符号函数 +* ONNX 算子 + * 使用现有 ONNX 算子 + * 定义新 ONNX 算子 + +那么,面对不同的情况时,就需要我们灵活地选用和组合这些方法。听起来是不是很复杂?别担心,本篇文章中,我们将围绕着三种算子映射方法,学习三个添加算子支持的实例,来理清如何为 PyTorch 算子转 ONNX 算子的三个环节添加支持。 + +## 支持 ATen 算子 
+实际的部署过程中,我们都有可能会碰到一个最简单的算子缺失问题: 算子在 ATen 中已经实现了,ONNX 中也有相关算子的定义,但是相关算子映射成 ONNX 的规则没有写。在这种情况下,我们只需要**为 ATen 算子补充描述映射规则的符号函数**就行了。 + +> [ATen](https://pytorch.org/cppdocs/#aten) 是 PyTorch 内置的 C++ 张量计算库,PyTorch 算子在底层绝大多数计算都是用 ATen 实现的。 + +上期习题中,我们曾经提到了 ONNX 的 `Asinh` 算子。这个算子在 ATen 中有实现,却缺少了映射到 ONNX 算子的符号函数。在这里,我们来尝试为它补充符号函数,并导出一个包含这个算子的 ONNX 模型。 + +### 获取 ATen 中算子接口定义 +为了编写符号函数,我们需要获得 `asinh` 推理接口的输入参数定义。这时,我们要去 `torch/_C/_VariableFunctions.pyi` 和 `torch/nn/functional.pyi` 这两个文件中搜索我们刚刚得到的这个算子名。这两个文件是编译 PyTorch 时本地自动生成的文件,里面包含了 ATen 算子的 PyTorch 调用接口。通过搜索,我们可以知道 `asinh` 在文件 `torch/_C/_VariableFunctions.pyi` 中,其接口定义为: + +```python +def asinh(input: Tensor, *, out: Optional[Tensor]=None) -> Tensor: ... +``` + +经过这些步骤,我们确认了缺失的算子名为 `asinh`,它是一个有实现的 ATen 算子。我们还记下了 `asinh` 的调用接口。接下来,我们要为它补充符号函数,使它在转换成 ONNX 模型时不再报错。 + +### 添加符号函数 +到目前为止,我们已经多次接触了定义 PyTorch 到 ONNX 映射规则的符号函数了。现在,我们向大家正式介绍一下符号函数。 + +符号函数,可以看成是 PyTorch 算子类的一个静态方法。在把 PyTorch 模型转换成 ONNX 模型时,各个 PyTorch 算子的符号函数会被依次调用,以完成 PyTorch 算子到 ONNX 算子的转换。符号函数的定义一般如下: + +```python +def symbolic(g: torch._C.Graph, input_0: torch._C.Value, input_1: torch._C.Value, ...): +``` + +其中,`torch._C.Graph` 和 `torch._C.Value` 都对应 PyTorch 的 C++ 实现里的一些类。我们在这篇文章不深究它们的细节,只需要知道第一个参数就固定叫 `g`,它表示和计算图相关的内容;后面的每个参数都表示算子的输入,需要和算子的前向推理接口的输入相同。对于 ATen 算子来说,它们的前向推理接口就是上述两个 `.pyi` 文件里的函数接口。 + +`g` 有一个方法 `op`。在把 PyTorch 算子转换成 ONNX 算子时,需要在符号函数中调用此方法来为最终的计算图添加一个 ONNX 算子。其定义如下: + +```python +def op(name: str, input_0: torch._C.Value, input_1: torch._C.Value, ...) 
+``` + +其中,第一个参数是算子名称。如果该算子是普通的 ONNX 算子,只需要把它在 ONNX 官方文档里的名称填进去即可(我们稍后再讲其他情况)。 + +在最简单的情况下,我们只要把 PyTorch 算子的输入用`g.op()`一一对应到 ONNX 算子上即可,并把`g.op()`的返回值作为符号函数的返回值。在情况更复杂时,我们转换一个 PyTorch 算子可能要新建若干个 ONNX 算子。 + +补充完了背景知识,让我们回到 `asinh` 算子上,来为它编写符号函数。我们先去翻阅一下 ONNX 算子文档,学习一下我们在符号函数里的映射关系 `g.op()` 里应该怎么写。[`Asinh` 的文档](https://github.com/onnx/onnx/blob/main/docs/Operators.md#asinh)写道:该算子有一个输入 `input`,一个输出 `output`,二者的类型都为张量。 + +到这里,我们已经完成了信息收集环节。我们在上一小节得知了 `asinh` 的推理接口定义,在这一小节里收集了 ONNX 算子 `Asinh` 的定义。现在,我们可以用代码来补充这二者的映射关系了。在刚刚导出 `asinh` 算子的代码中,我们添加以下内容: + +```python +from torch.onnx.symbolic_registry import register_op + +def asinh_symbolic(g, input, *, out=None): + return g.op("Asinh", input) + +register_op('asinh', asinh_symbolic, '', 9) +``` + +这里的`asinh_symbolic`就是`asinh`的符号函数。从除`g`以外的第二个输入参数开始,其输入参数应该严格对应它在 ATen 中的定义: + +```python +def asinh(input: Tensor, *, out: Optional[Tensor]=None) -> Tensor: ... +``` + +在符号函数的函数体中,`g.op("Asinh", input)`则完成了 ONNX 算子的定义。其中,第一个参数`"Asinh"`是算子在 ONNX 中的名称。至于第二个参数 `input`,如我们刚刚在文档里所见,这个算子只有一个输入,因此我们只要把符号函数的输入参数 `input` 对应过去就行。ONNX 的 `Asinh` 的输出和 ATen 的 `asinh` 的输出是一致的,因此我们直接把 `g.op()` 的结果返回即可。 + +定义完符号函数后,我们要把这个符号函数和原来的 ATen 算子“绑定”起来。这里,我们要用到 `register_op` 这个 PyTorch API 来完成绑定。如示例所示,只需要一行简单的代码即可把符号函数 `asinh_symbolic` 绑定到算子 `asinh` 上: + +```python +register_op('asinh', asinh_symbolic, '', 9) +``` + +`register_op`的第一个参数是目标 ATen 算子名,第二个是要注册的符号函数,这两个参数很好理解。第三个参数是算子的“域”,对于普通 ONNX 算子,直接填空字符串即可。第四个参数表示向哪个算子集版本注册。我们遵照 ONNX 标准,向第 9 号算子集注册。值得注意的是,这里向第 9 号算子集注册,不代表较新的算子集(第 10 号、第 11 号……)都得到了注册。在示例中,我们先只向第 9 号算子集注册。 + +整理一下,我们最终的代码如下: + +```python +import torch + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.asinh(x) + +from torch.onnx.symbolic_registry import register_op + +def asinh_symbolic(g, input, *, out=None): + return g.op("Asinh", input) + +register_op('asinh', asinh_symbolic, '', 9) + +model = Model() +input = torch.rand(1, 3, 10, 10) +torch.onnx.export(model, input, 
'asinh.onnx') +``` + +成功导出的话,`asinh.onnx` 应该长这个样子: + +![](https://user-images.githubusercontent.com/47652064/169744691-f14e4fd4-c777-4562-aaa5-a5bf888f21f8.png) + +### 测试算子 +在完成了一份自定义算子后,我们一定要测试一下算子的正确性。一般我们要用 PyTorch 运行一遍原算子,再用推理引擎(比如 ONNX Runtime)运行一下 ONNX 算子,最后比对两次的运行结果。对于我们刚刚得到的 `asinh.onnx`,可以用如下代码来验证: + +```python +import onnxruntime +import torch +import numpy as np + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.asinh(x) + +model = Model() +input = torch.rand(1, 3, 10, 10) +torch_output = model(input).detach().numpy() + +sess = onnxruntime.InferenceSession('asinh.onnx') +ort_output = sess.run(None, {'0': input.numpy()})[0] + +assert np.allclose(torch_output, ort_output) +``` + +在这份代码里,我们用 PyTorch 做了一遍推理,并把结果转成了 numpy 格式。之后,我们又用 ONNX Runtime 对 onnx 文件做了一次推理。最后,我们使用 `np.allclose` 来保证两个结果张量的误差在一个可以允许的范围内。一切正常的话,运行这段代码后,`assert` 所在行不会报错,程序应该没有任何输出。 + +## 支持 TorchScript 算子 +对于一些比较复杂的运算,仅使用 PyTorch 原生算子是无法实现的。这个时候,就要考虑自定义一个 PyTorch 算子,再把它转换到 ONNX 中了。新增 PyTorch 算子的方法有很多,PyTorch 官方比较推荐的一种做法是[添加 TorchScript 算子](https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html) 。 + +由于添加算子的方法较繁琐,我们今天跳过新增 TorchScript 算子的内容,以可变形卷积(Deformable Convolution)算子为例,介绍为现有 TorchScript 算子添加 ONNX 支持的方法。 + +> 可变形卷积(Deformable Convolution)是在 Torchvision 中实现的 TorchScript 算子,虽然尚未得到广泛支持,但是出现在许多模型中。 + +有了支持 ATen 算子的经验之后,我们可以知道为算子添加符号函数一般要经过以下几步: + +1. 获取原算子的前向推理接口。 +2. 获取目标 ONNX 算子的定义。 +3. 
编写符号函数并绑定。 + +在为可变形卷积添加符号函数时,我们也可以尝试走一遍这个流程。 + +### 使用 TorchScript 算子 +和之前一样,我们首先定义一个包含了算子的模型,为之后转换 ONNX 模型做准备。 + +```python +import torch +import torchvision + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(3, 18, 3) + self.conv2 = torchvision.ops.DeformConv2d(3, 3, 3) + + def forward(self, x): + return self.conv2(x, self.conv1(x)) +``` + +其中,`torchvision.ops.DeformConv2d` 就是 Torchvision 中的可变形卷积层。相比于普通卷积,可变形卷积的其他参数都大致相同,唯一的区别就是在推理时需要多输入一个表示偏移量的张量。 + +然后,我们查询算子的前向推理接口。`DeformConv2d` 层最终会调用 `deform_conv2d` 这个算子。我们可以在 `torchvision/csrc/ops/deform_conv2d.cpp` 中查到该算子的调用接口: + +```python +m.def(TORCH_SELECTIVE_SCHEMA( + "torchvision::deform_conv2d(Tensor input, + Tensor weight, + Tensor offset, + ...... + bool use_mask) -> Tensor")); +``` + +那么接下来,根据之前的经验,我们就是要去 ONNX 官方文档中查找算子的定义了。 + +### 自定义 ONNX 算子 +很遗憾的是,如果我们去 ONNX 的官方算子页面搜索 "deform",将搜不出任何内容。目前,ONNX 还没有提供可变形卷积的算子,我们要自己定义一个 ONNX 算子了。 + +我们在前面讲过,`g.op()` 是用来定义 ONNX 算子的函数。对于 ONNX 官方定义的算子,`g.op()` 的第一个参数就是该算子的名称。而对于一个自定义算子,`g.op()` 的第一个参数是一个带命名空间的算子名,比如: + +```python +g.op("custom::deform_conv2d", ...) 
+``` + +其中,"::"前面的内容就是我们的命名空间。该概念和 C++ 的命名空间类似,是为了防止命名冲突而设定的。如果在 `g.op()` 里不加前面的命名空间,则算子会被默认成 ONNX 的官方算子。 + +PyTorch 在运行 `g.op()` 时会对官方的算子做检查,如果算子名有误,或者算子的输入类型不正确, `g.op()` 就会报错。为了让我们随心所欲地定义新 ONNX 算子,我们必须设定一个命名空间,给算子取个名,再定义自己的算子。 + +我们在[第一篇教程](chapter_01_introduction_to_model_deployment.md)学过:ONNX 是一套标准,本身并不包括实现。在这里,我们就简略地定义一个 ONNX 可变形卷积算子,而不去写它在某个推理引擎上的实现。在之后的教程中,我们再学习在各个推理引擎中添加新 ONNX 算子支持的方法。此处,我们只关心如何导出一个包含新 ONNX 算子节点的 onnx 文件。因此,我们可以为新算子编写如下简单的符号函数: + +```python +@parse_args("v", "v", "v", "v", "v", "i", "i", "i", "i", "i", "i", "i", "i", "none") +def symbolic(g, + input, + weight, + offset, + mask, + bias, + stride_h, stride_w, + pad_h, pad_w, + dil_h, dil_w, + n_weight_grps, + n_offset_grps, + use_mask): + return g.op("custom::deform_conv2d", input, offset) +``` + +在这个符号函数中,我们以刚刚搜索到的算子输入参数作为符号函数的输入参数,并只用 `input` 和 `offset` 来构造一个简单的 ONNX 算子。 + +这段代码中,最令人疑惑的就是装饰器 `@parse_args` 了。简单来说,TorchScript 算子的符号函数要求标注出每一个输入参数的类型。比如"v"表示 Torch 库里的 `value` 类型,一般用于标注张量,而"i"表示 int 类型,"f"表示 float 类型,"none"表示该参数为空。具体的类型含义可以在 [torch.onnx.symbolic_helper.py](https://github.com/pytorch/pytorch/blob/master/torch/onnx/symbolic_helper.py)中查看。这里输入参数中的 `input, weight, offset, mask, bias` 都是张量,所以用"v"表示。后面的其他参数同理。我们不必纠结于 `@parse_args`的原理,根据实际情况对符号函数的参数标注类型即可。 + +有了符号函数后,我们通过如下的方式注册符号函数: + +```python +register_custom_op_symbolic("torchvision::deform_conv2d", symbolic, 9) +``` + +和前面的 `register_op` 类似,注册符号函数时,我们要输入算子名、符号函数、算子集版本。与前面不同的是,这里的算子集版本是最早生效版本,在这里设定版本 9,意味着之后的第 10 号、第 11 号……版本集都能使用这个新算子。 + +最后,我们完整的模型导出代码如下: + +```python +import torch +import torchvision + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(3, 18, 3) + self.conv2 = torchvision.ops.DeformConv2d(3, 3, 3) + + def forward(self, x): + return self.conv2(x, self.conv1(x)) + +from torch.onnx import register_custom_op_symbolic +from torch.onnx.symbolic_helper import parse_args + +@parse_args("v", "v", "v", "v", "v", "i", "i", "i", "i", "i", "i", "i", "i", "none") +def 
symbolic(g, + input, + weight, + offset, + mask, + bias, + stride_h, stride_w, + pad_h, pad_w, + dil_h, dil_w, + n_weight_grps, + n_offset_grps, + use_mask): + return g.op("custom::deform_conv2d", input, offset) + +register_custom_op_symbolic("torchvision::deform_conv2d", symbolic, 9) + +model = Model() +input = torch.rand(1, 3, 10, 10) +torch.onnx.export(model, input, 'dcn.onnx') +``` + +代码成功运行的话,我们应该能得到如下的 ONNX 模型: + + +![](https://user-images.githubusercontent.com/47652064/169744720-51ea91bc-b67b-4911-9e43-0adc1b64d2c1.jpg) + +可以看到,我们自定义的 ONNX 算子 `deform_conv2d` 包含了两个输入,一个输出,和我们预想得一样。 + +## 使用 torch.autograd.Function +最后,我们来学习一种简单的为 PyTorch 添加 C++ 算子实现的方法,来代替较为复杂的新增 TorchScript 算子。同时,我们会用 torch.autograd.Function 封装这个新算子。torch.autograd.Function 能完成算子实现和算子调用的隔离。不管算子是怎么实现的,它封装后的使用体验以及 ONNX 导出方法会和原生的 PyTorch 算子一样。这是我们比较推荐的为算子添加 ONNX 支持的方法。 + +为了应对更复杂的情况,我们来自定义一个奇怪的 `my_add` 算子。这个算子的输入张量 a, b ,输出 `2a + b` 的值。我们会先把它在 PyTorch 中实现,再把它导出到 ONNX 中。 + +### 为 PyTorch 添加 C++ 拓展 +为 PyTorch 添加简单的 C++ 拓展还是很方便的。对于我们定义的 my_add 算子,可以用以下的 C++ 源文件来实现。我们把该文件命名为 "my_add.cpp": + +```C++ +// my_add.cpp + +#include + +torch::Tensor my_add(torch::Tensor a, torch::Tensor b) +{ + return 2 * a + b; +} + +PYBIND11_MODULE(my_lib, m) +{ + m.def("my_add", my_add); +} +``` + +由于在 PyTorch 中添加 C++ 拓展和模型部署关系不大,这里我们仅给出这个简单的示例,并不对其原理做过多讲解。 + +在这段代码中,torch::Tensor 就是 C++ 中 torch 的张量类型,它的加法和乘法等运算符均已重载。因此,我们可以像对普通标量一样对张量做加法和乘法。 + +轻松地完成了算子的实现后,我们用 `PYBIND11_MODULE` 来为 C++ 函数提供 Python 调用接口。这里的 `my_lib` 是我们未来要在 Python 里导入的模块名。双引号中的 `my_add` 是 Python 调用接口的名称,这里我们对齐 C++ 函数的名称,依然用 "my_add"这个名字。 + +之后,我们可以编写如下的 Python 代码并命名为 "setup.py",来编译刚刚的 C++ 文件: + +```python +from setuptools import setup +from torch.utils import cpp_extension + +setup(name='my_add', + ext_modules=[cpp_extension.CppExtension('my_lib', ['my_add.cpp'])], + cmdclass={'build_ext': cpp_extension.BuildExtension}) +``` + +这段代码使用了 Python 的 setuptools 编译功能和 PyTorch 的 C++ 拓展工具函数,可以编译包含了 torch 库的 C++ 源文件。这里我们需要填写的只有模块名和模块中的源文件名。我们刚刚把模块命名为 
`my_lib`,而源文件只有一个 `my_add.cpp`,因此拓展模块那一行要写成 `ext_modules=[cpp_extension.CppExtension('my_lib', ['my_add.cpp'])],`。 + +之后,像处理普通的 Python 包一样执行安装命令,我们的 C++ 代码就会自动编译了。 + +```shell +python setup.py develop +``` + +### 用 `torch.autograd.Function` 封装 + +直接用 Python 接口调用 C++ 函数不太“美观”,一种比较优雅的做法是把这个调用接口封装起来。这里我们用 `torch.autograd.Function` 来封装算子的底层调用: + +```python +import torch +import my_lib +class MyAddFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, a, b): + return my_lib.my_add(a, b) + + @staticmethod + def symbolic(g, a, b): + two = g.op("Constant", value_t=torch.tensor([2])) + a = g.op('Mul', a, two) + return g.op('Add', a, b) +``` + +我们在前面的教程中已经见过 `torch.autograd.Function`,这里我们正式地对其做一个介绍。`Function` 类本身表示 PyTorch 的一个可导函数,只要为其定义了前向推理和反向传播的实现,我们就可以把它当成一个普通 PyTorch 函数来使用。 + +PyTorch 会自动调度该函数,合适地执行前向和反向计算。对模型部署来说,`Function` 类有一个很好的性质:如果它定义了 `symbolic` 静态方法,该 `Function` 在执行 `torch.onnx.export()` 时就可以根据 `symbolic` 中定义的规则转换成 ONNX 算子。这个 `symbolic` 就是前面提到的符号函数,只是它的名称必须是 `symbolic` 而已。 + +在 `forward `函数中,我们用 `my_lib.my_add(a, b)` 就可以调用之前写的C++函数了。这里 `my_lib` 是库名,`my_add` 是函数名,这两个名字是在前面C++的 `PYBIND11_MODULE` 中定义的。 + +在 `symbolic` 函数中,我们用 `g.op()` 定义了三个算子:常量、乘法、加法。这里乘法和加法的用法和前面提到的 `asinh` 一样,只需要根据 ONNX 算子定义规则把输入参数填入即可。而在定义常量算子时,我们要把 PyTorch 张量的值传入 `value_t` 参数中。 + +在 ONNX 中,我们需要把新建常量当成一个算子来看待,尽管这个算子并不会以节点的形式出现在 ONNX 模型的可视化结果里。 + +把算子封装成 Function 后,我们可以把 `my_add` 算子用起来了。 + +```python +my_add = MyAddFunction.apply + +class MyAdd(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, a, b): + return my_add(a, b) +``` + +在这份代码里,我们先用 `my_add = MyAddFunction.apply` 获取了一个奇怪的变量。这个变量是用来做什么的呢?其实,`apply`是`torch.autograd.Function` 的一个方法,这个方法完成了 `Function` 在前向推理或者反向传播时的调度。我们在使用 `Function` 的派生类做推理时,不应该显式地调用 `forward`,而应该调用其 `apply` 方法。 + +这里我们使用 `my_add = MyAddFunction.apply` 把这个调用方法取了一个更简短的别名 `my_add`。以后在使用 `my_add` 算子时,我们应该忽略 `MyAddFunction` 的实现细节,而只通过 `my_add` 这个接口来访问算子。这里 `my_add` 的地位,和 PyTorch 的 `asinh, interpolate, conv2d`等原生函数是类似的。 + 
+有了访问新算子的接口后,我们可以进一步把算子封装成一个神经网络中的计算层。我们定义一个叫做 `MyAdd` 的 `torch.nn.Module`,它封装了`my_add`,就和封装了`conv2d` 的 `torch.nn.Conv2d` 一样。 + +### 测试算子 +费了好大的功夫来“包装”我们的新算子后,我们终于可以来使用它了。和之前的测试流程一样,让我们用下面的代码来导出一个包含新算子的 ONNX 模型,并验证一下它是否正确。 + +```python +model = MyAdd() +input = torch.rand(1, 3, 10, 10) +torch.onnx.export(model, (input, input), 'my_add.onnx') +torch_output = model(input, input).detach().numpy() + +import onnxruntime +import numpy as np +sess = onnxruntime.InferenceSession('my_add.onnx') +ort_output = sess.run(None, {'a': input.numpy(), 'b': input.numpy()})[0] + +assert np.allclose(torch_output, ort_output) +``` + +在这份代码中,我们直接把 `MyAdd` 作为要导出的模型。我们计算了一个 PyTorch 模型的运行结果,又导出 ONNX 模型,计算了 ONNX 模型在 ONNX Runtime 上的运算结果。如果一切正常的话,这两个结果是一样的,这份代码不会报任何错误,没有任何输出。 + +![](https://user-images.githubusercontent.com/47652064/169744753-0fb00930-bbca-4636-8681-4ec4e7b31946.jpg) + +可视化一下 `my_add.onnx`,可以看出,和我们设计得一样,`my_add` 算子被翻译成了两个 ONNX 算子节点(其中常量算子被放入了 `Mul` 的参数中)。 + +整理一下,整个流程的 Python 代码如下: + +```python +import torch +import my_lib +class MyAddFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, a, b): + return my_lib.my_add(a, b) + + @staticmethod + def symbolic(g, a, b): + two = g.op("Constant", value_t=torch.tensor([2])) + a = g.op('Mul', a, two) + return g.op('Add', a, b) + +my_add = MyAddFunction.apply + +class MyAdd(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, a, b): + return my_add(a, b) + +model = MyAdd() +input = torch.rand(1, 3, 10, 10) +torch.onnx.export(model, (input, input), 'my_add.onnx') +torch_output = model(input, input).detach().numpy() + +import onnxruntime +import numpy as np +sess = onnxruntime.InferenceSession('my_add.onnx') +ort_output = sess.run(None, {'a': input.numpy(), 'b': input.numpy()})[0] + +assert np.allclose(torch_output, ort_output) +``` + +## 总结 + +在这篇教程中,我们围绕“为 ATen 算子添加符号函数”、“为 TorchScript 算子添加符号函数”、“封装成 `torch.autograd.Function` 并添加符号函数”这三种添加映射关系的方法,讲解了 3 个为 PyTorch 和 ONNX 
添加支持的实例。在这个过程中,我们学到了很多零散的知识,来总结一下吧。 + +* ATen 是 PyTorch 的 C++ 张量运算库。通过查询 torch/_C/_VariableFunctions.pyi 和 torch/nn/functional.pyi,我们可以知道 ATen 算子的 Python 接口定义。 +* 用 register_op 可以为 ATen 算子补充注册符号函数 +* 用 register_custom_op_symbolic 可以为 TorchScript 算子补充注册符号函数 +* 如何在 PyTorch 里添加 C++ 拓展 +* 如何用 torch.autograd.Function 封装一个自定义 PyTorch 算子 +* 如何编写符号函数 symbolic(g, ...)。 +* 如何用 g.op() 把一个 PyTorch 算子映射成一个或多个 ONNX 算子,或者是自定义的 ONNX 算子。 + +这篇教程涉及的代码比较多。如果大家在阅读时碰到了问题,最好去跑一跑代码,改一改代码里的内容,实际感受一下每行代码的意义。 + +## 上期习题解答 + +1. PyTorch 目前没有支持 ONNX 的 `Asinh` 算子。我们在 `torch.onnx.symbolic_opset9.py` 中搜索不到 Asinh 的相关内容。 +2. 通过在 `torch.onnx.symbolic_opset11.py` 搜索 `BitShift`,我们可以发现 PyTorch 在 `__lshift_` 和 `__rshift_` 里用到了ONNX的 `BitShift` 算子。当输入类型为 `Byte` 时,PyTorch会把算子直接翻译翻译 `BitShift`,以代替乘除 2 的次幂的操作。 +3. 对应 `Resize` 算子的第3个参数(`g.op()` 的第4个参数)`scales`。原来的 `scales` 传入 `g.op() `前会经过 `_interpolate_get_scales_if_available()` 函数,一定会被转换成一个常量。为了让 `scales` 由输入决定,我们直接把输入参数中的 `scales` 传入 `g.op()`。 From 4f49763c28e8acab6802685cc5e562ec8e2cc42d Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Wed, 25 May 2022 09:52:42 +0800 Subject: [PATCH 45/51] fix mmseg twice resize (#480) * fix mmseg twich resize * remove comment --- .../mmseg/models/segmentors/encoder_decoder.py | 14 +------------- .../test_codebase/test_mmseg/test_mmseg_models.py | 6 ++++-- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/mmdeploy/codebase/mmseg/models/segmentors/encoder_decoder.py b/mmdeploy/codebase/mmseg/models/segmentors/encoder_decoder.py index bca614ae86..0ed9ace84f 100644 --- a/mmdeploy/codebase/mmseg/models/segmentors/encoder_decoder.py +++ b/mmdeploy/codebase/mmseg/models/segmentors/encoder_decoder.py @@ -1,9 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import torch.nn.functional as F -from mmseg.ops import resize from mmdeploy.core import FUNCTION_REWRITER -from mmdeploy.utils import is_dynamic_shape @FUNCTION_REWRITER.register_rewriter( @@ -25,16 +23,6 @@ def encoder_decoder__simple_test(ctx, self, img, img_meta, **kwargs): torch.Tensor: Output segmentation map pf shape [N, 1, H, W]. """ seg_logit = self.encode_decode(img, img_meta) - seg_logit = resize( - input=seg_logit, - size=img_meta['img_shape'], - mode='bilinear', - align_corners=self.align_corners) seg_logit = F.softmax(seg_logit, dim=1) - seg_pred = seg_logit.argmax(dim=1) - # our inference backend only support 4D output - shape = seg_pred.shape - if not is_dynamic_shape(ctx.cfg): - shape = [int(_) for _ in shape] - seg_pred = seg_pred.view(shape[0], 1, shape[1], shape[2]) + seg_pred = seg_logit.argmax(dim=1, keepdim=True) return seg_pred diff --git a/tests/test_codebase/test_mmseg/test_mmseg_models.py b/tests/test_codebase/test_mmseg/test_mmseg_models.py index dfcd5b4cdb..d5f2285939 100644 --- a/tests/test_codebase/test_mmseg/test_mmseg_models.py +++ b/tests/test_codebase/test_mmseg/test_mmseg_models.py @@ -93,7 +93,8 @@ def _demo_mm_inputs(input_shape=(1, 3, 8, 16), num_classes=10): return mm_inputs -@pytest.mark.parametrize('backend', [Backend.ONNXRUNTIME, Backend.OPENVINO]) +@pytest.mark.parametrize('backend', + [Backend.ONNXRUNTIME, Backend.OPENVINO, Backend.NCNN]) def test_encoderdecoder_simple_test(backend): check_backend(backend) segmentor = get_model() @@ -109,7 +110,8 @@ def test_encoderdecoder_simple_test(backend): num_classes = segmentor.decode_head[-1].num_classes else: num_classes = segmentor.decode_head.num_classes - mm_inputs = _demo_mm_inputs(num_classes=num_classes) + mm_inputs = _demo_mm_inputs( + input_shape=(1, 3, 32, 32), num_classes=num_classes) imgs = mm_inputs.pop('imgs') img_metas = mm_inputs.pop('img_metas') model_inputs = {'img': imgs, 'img_meta': img_metas} From 0878b8ff7d7ede809ecab0c948baa76f7d756ca7 Mon Sep 17 00:00:00 
2001 From: RunningLeon Date: Thu, 26 May 2022 13:02:09 +0800 Subject: [PATCH 46/51] Fix mask test with mismatched device (#511) * align mask output to cpu device * align ncnn ssd output to torch.Tensor type * --amend --- .../mmdet/deploy/object_detection_model.py | 53 +++++++++---------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/mmdeploy/codebase/mmdet/deploy/object_detection_model.py b/mmdeploy/codebase/mmdet/deploy/object_detection_model.py index 34b1e8bcc6..a9af63351e 100644 --- a/mmdeploy/codebase/mmdet/deploy/object_detection_model.py +++ b/mmdeploy/codebase/mmdet/deploy/object_detection_model.py @@ -111,37 +111,40 @@ def __clear_outputs( return outputs @staticmethod - def postprocessing_masks(det_bboxes: np.ndarray, - det_masks: np.ndarray, + def postprocessing_masks(det_bboxes: Union[np.ndarray, torch.Tensor], + det_masks: Union[np.ndarray, torch.Tensor], img_w: int, img_h: int, - device: str = 'cpu', - mask_thr_binary: float = 0.5) -> np.ndarray: + device: str = 'cpu') -> torch.Tensor: """Additional processing of masks. Resizes masks from [num_det, 28, 28] to [num_det, img_w, img_h]. Analog of the 'mmdeploy.codebase.mmdet. models.roi_heads.fcn_mask_head._do_paste_mask' function. Args: - det_bboxes (np.ndarray): Bbox of shape [num_det, 4] - det_masks (np.ndarray): Masks of shape [num_det, 28, 28]. + det_bboxes (np.ndarray | Tensor): Bbox of shape [num_det, 4] + det_masks (np.ndarray | Tensor): Masks of shape [num_det, 28, 28]. img_w (int): Width of the original image. img_h (int): Height of the original image. - mask_thr_binary (float): The threshold for the mask. + device :(str): The device type. Returns: - np.ndarray: masks of shape [N, num_det, img_h, img_w]. + torch.Tensor: masks of shape [N, num_det, img_h, img_w]. """ masks = det_masks bboxes = det_bboxes - + device = torch.device(device) num_det = bboxes.shape[0] # Skip postprocessing if no detections are found. 
if num_det == 0: - return np.zeros((0, img_h, img_w)) + return torch.zeros( + 0, img_h, img_w, dtype=torch.float32, device=device) if isinstance(masks, np.ndarray): - masks = torch.tensor(masks, device=torch.device(device)) - bboxes = torch.tensor(bboxes, device=torch.device(device)) + masks = torch.tensor(masks, device=device) + bboxes = torch.tensor(bboxes, device=device) + + masks = masks.to(device) + bboxes = bboxes.to(device) result_masks = [] for bbox, mask in zip(bboxes, masks): @@ -150,15 +153,9 @@ def postprocessing_masks(det_bboxes: np.ndarray, x1_int, y1_int = img_w, img_h img_y = torch.arange( - y0_int, - y1_int, - dtype=torch.float32, - device=torch.device(device)) + 0.5 + y0_int, y1_int, dtype=torch.float32, device=device) + 0.5 img_x = torch.arange( - x0_int, - x1_int, - dtype=torch.float32, - device=torch.device(device)) + 0.5 + x0_int, x1_int, dtype=torch.float32, device=device) + 0.5 x0, y0, x1, y1 = bbox img_y = (img_y - y0) / (y1 - y0) * 2 - 1 @@ -208,15 +205,13 @@ def forward(self, img: Sequence[torch.Tensor], img_metas: Sequence[dict], rescale = kwargs.get('rescale', True) for i in range(batch_size): dets, labels = batch_dets[i], batch_labels[i] - dets = dets.to(device=torch.device(self.device)) if rescale: scale_factor = img_metas[i]['scale_factor'] if isinstance(scale_factor, (list, tuple, np.ndarray)): assert len(scale_factor) == 4 scale_factor = np.array(scale_factor)[None, :] # [1,4] - scale_factor = torch.from_numpy(scale_factor).to( - device=torch.device(self.device)) + scale_factor = torch.from_numpy(scale_factor).to(dets) dets[:, :4] /= scale_factor if 'border' in img_metas[i]: @@ -255,6 +250,8 @@ def forward(self, img: Sequence[torch.Tensor], img_metas: Sequence[dict], masks = masks.squeeze(0) if masks.dtype != bool: masks = masks >= 0.5 + # aligned with mmdet to easily convert to numpy + masks = masks.cpu() segms_results = [[] for _ in range(len(self.CLASSES))] for j in range(len(dets)): segms_results[labels[j]].append(masks[j]) @@ 
-600,23 +597,21 @@ def forward_test(self, imgs: torch.Tensor, *args, **kwargs) -> List: imgs (torch.Tensor): Input image(s) in [N x C x H x W] format. Returns: - list[np.ndarray]: dets of shape [N, num_det, 5] and + list[torch.Tensor]: dets of shape [N, num_det, 5] and class labels of shape [N, num_det]. """ _, _, H, W = imgs.shape outputs = self.wrapper({self.input_name: imgs}) for key, item in outputs.items(): if item is None: - return [np.zeros((1, 0, 5)), np.zeros((1, 0))] + return torch.zeros(1, 0, 5), torch.zeros(1, 0) out = self.wrapper.output_to_list(outputs)[0] labels = out[:, :, 0] - 1 - scales = torch.tensor([W, H, W, H]).reshape(1, 1, 4) + scales = torch.tensor([W, H, W, H]).reshape(1, 1, 4).to(out) scores = out[:, :, 1:2] boxes = out[:, :, 2:6] * scales dets = torch.cat([boxes, scores], dim=2) - dets = dets.detach().cpu().numpy() - labels = labels.detach().cpu().numpy() - return [dets, labels] + return dets, labels @__BACKEND_MODEL.register_module('sdk') From 32482e76fde68839890ff184cb2d2adb5de11879 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Fri, 27 May 2022 10:58:26 +0800 Subject: [PATCH 47/51] compat mmpose v0.26 (#518) --- .../codebase/mmpose/deploy/pose_detection.py | 19 ++++++++++++------- tests/test_codebase/test_mmpose/data/model.py | 7 +++++++ .../test_mmpose/test_pose_detection.py | 1 - 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/mmdeploy/codebase/mmpose/deploy/pose_detection.py b/mmdeploy/codebase/mmpose/deploy/pose_detection.py index 0405523400..bd02a10f40 100644 --- a/mmdeploy/codebase/mmpose/deploy/pose_detection.py +++ b/mmdeploy/codebase/mmpose/deploy/pose_detection.py @@ -135,7 +135,6 @@ def create_input(self, Returns: tuple: (data, img), meta information for the input image and input. 
""" - from mmpose.apis.inference import _box2cs from mmpose.datasets.dataset_info import DatasetInfo from mmpose.datasets.pipelines import Compose @@ -160,17 +159,12 @@ def create_input(self, image_size = input_shape else: image_size = np.array(cfg.data_cfg['image_size']) - for bbox in bboxes: - center, scale = _box2cs(cfg, bbox) + for bbox in bboxes: # prepare data data = { 'img': imgs, - 'center': - center, - 'scale': - scale, 'bbox_score': bbox[4] if len(bbox) == 5 else 1, 'bbox_id': @@ -190,6 +184,17 @@ def create_input(self, } } + # for compatibility of mmpose + try: + # for mmpose<=v0.25.1 + from mmpose.apis.inference import _box2cs + center, scale = _box2cs(cfg, bbox) + data['center'] = center + data['scale'] = scale + except ImportError: + # for mmpose>=v0.26.0 + data['bbox'] = bbox + data = test_pipeline(data) batch_data.append(data) diff --git a/tests/test_codebase/test_mmpose/data/model.py b/tests/test_codebase/test_mmpose/data/model.py index 947b396f5d..68aca4e53d 100644 --- a/tests/test_codebase/test_mmpose/data/model.py +++ b/tests/test_codebase/test_mmpose/data/model.py @@ -1,5 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
# model settings +import mmpose +from packaging import version + channel_cfg = dict( num_output_channels=17, dataset_joints=17, @@ -47,6 +50,7 @@ test_pipeline = [ dict(type='LoadImageFromFile'), + # dict(type='TopDownGetBboxCenterScale'), dict(type='TopDownAffine'), dict(type='ToTensor'), dict( @@ -61,6 +65,9 @@ 'flip_pairs' ]), ] +# compatible with mmpose >=v0.26.0 +if version.parse(mmpose.__version__) >= version.parse('0.26.0'): + test_pipeline.insert(1, dict(type='TopDownGetBboxCenterScale')) dataset_info = dict( dataset_name='coco', diff --git a/tests/test_codebase/test_mmpose/test_pose_detection.py b/tests/test_codebase/test_mmpose/test_pose_detection.py index 012c67f346..4a8085a63e 100644 --- a/tests/test_codebase/test_mmpose/test_pose_detection.py +++ b/tests/test_codebase/test_mmpose/test_pose_detection.py @@ -46,7 +46,6 @@ def test_create_input(): - model_cfg = load_config(model_cfg_path)[0] deploy_cfg = mmcv.Config( dict( backend_config=dict(type=Backend.ONNXRUNTIME.value), From 571b24050053cbdcc7da59db507020c1be918dd9 Mon Sep 17 00:00:00 2001 From: AllentDan <41138331+AllentDan@users.noreply.github.com> Date: Fri, 27 May 2022 14:23:28 +0800 Subject: [PATCH 48/51] [Docs] adding new backends when using MMDeploy as a third package (#482) * update doc * refine expression * cn doc --- .../en/tutorials/how_to_support_new_backends.md | 17 +++++++++++++++++ .../tutorials/how_to_support_new_backends.md | 16 ++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/docs/en/tutorials/how_to_support_new_backends.md b/docs/en/tutorials/how_to_support_new_backends.md index c18cd86148..0106a9b10e 100644 --- a/docs/en/tutorials/how_to_support_new_backends.md +++ b/docs/en/tutorials/how_to_support_new_backends.md @@ -229,3 +229,20 @@ Although the backend engines are usually implemented in C/C++, it is convenient ``` 5. Add docstring and unit tests for new code :). 
+ + +### Support new backends using MMDeploy as a third party +The previous parts show how to add a new backend in MMDeploy, which requires changing its source code. However, if we treat MMDeploy as a third-party package, the methods above are no longer applicable. To this end, adding a new backend requires us to pre-install another package named `aenum`. We can install it directly through `pip install aenum`. + +After installing `aenum` successfully, we can use it to add a new backend through: +```python +from mmdeploy.utils.constants import Backend +from aenum import extend_enum + +try: + Backend.get('backend_name') +except Exception: + extend_enum(Backend, 'BACKEND', 'backend_name') +``` + +We can run the code above before we use the rewrite logic of MMDeploy. diff --git a/docs/zh_cn/tutorials/how_to_support_new_backends.md b/docs/zh_cn/tutorials/how_to_support_new_backends.md index 07fd14c19f..b84d935744 100644 --- a/docs/zh_cn/tutorials/how_to_support_new_backends.md +++ b/docs/zh_cn/tutorials/how_to_support_new_backends.md @@ -229,3 +229,19 @@ MMDeploy 中的后端必须支持 ONNX,因此后端能直接加载“.onnx” ``` 5. 为新后端引擎代码添加相关注释和单元测试 :). 
+ + +### 将MMDeploy作为第三方库时添加新后端 +前面的部分展示了如何在 MMDeploy 中添加新的后端,这需要更改其源代码。但是,如果我们将 MMDeploy 视为第三方,则上述方法不再有效。为此,添加一个新的后端需要我们预先安装另一个名为 `aenum` 的包。我们可以直接通过`pip install aenum`进行安装。 + +成功安装 `aenum` 后,我们可以通过以下方式使用它来添加新的后端: +```python +from mmdeploy.utils.constants import Backend +from aenum import extend_enum + +try: + Backend.get('backend_name') +except Exception: + extend_enum(Backend, 'BACKEND', 'backend_name') +``` +我们可以在使用 MMDeploy 的重写逻辑之前运行上面的代码,这就完成了新后端的添加。 From 6fa1787a04dac4ab75690de7b569b33d81d1cce1 Mon Sep 17 00:00:00 2001 From: Yifan Zhou Date: Sat, 28 May 2022 15:19:14 +0800 Subject: [PATCH 49/51] Tutorial 05: ONNX Model Editing (#517) * tutorial 05 * Upload image * resolve comments * resolve comment --- .../chapter_05_onnx_model_editing.md | 463 ++++++++++++++++++ 1 file changed, 463 insertions(+) create mode 100644 docs/zh_cn/tutorials/chapter_05_onnx_model_editing.md diff --git a/docs/zh_cn/tutorials/chapter_05_onnx_model_editing.md b/docs/zh_cn/tutorials/chapter_05_onnx_model_editing.md new file mode 100644 index 0000000000..f7ccf606d4 --- /dev/null +++ b/docs/zh_cn/tutorials/chapter_05_onnx_model_editing.md @@ -0,0 +1,463 @@ +# 模型部署入门教程(五):ONNX 模型的修改与调试 + +在前两期教程中,我们学习了 PyTorch 模型转 ONNX 模型的方法,了解了如何在原生算子表达能力不足时,为 PyTorch 或 ONNX 自定义算子。一直以来,我们都是通过 PyTorch 来导出 ONNX 模型的,基本没有单独探究过 ONNX 模型的构造知识。 + +不知道大家会不会有这样一些疑问:ONNX 模型在底层是用什么格式存储的?如何不依赖深度学习框架,只用 ONNX 的 API 来构造一个 ONNX 模型?如果没有源代码,只有一个 ONNX 模型,该如何对这个模型进行调试?这篇教程可以解答大家的这些问题。 + +在这期教程里,我们将围绕 ONNX 这一套神经网络定义标准本身,探究 ONNX 模型的构造、读取、子模型提取、调试。首先,我们会学习 ONNX 的底层表示方式。之后,我们会用 ONNX API 构造和读取模型。最后,我们会利用 ONNX 提供的子模型提取功能,学习如何调试 ONNX 模型。 + +## ONNX 的底层实现 +### ONNX 的存储格式 +ONNX 在底层是用 **Protobuf** 定义的。Protobuf,全称 Protocol Buffer,是 Google 提出的一套表示和序列化数据的机制。使用 Protobuf 时,用户需要先写一份数据定义文件,再根据这份定义文件把数据存储进一份二进制文件。可以说,数据定义文件就是数据类,二进制文件就是数据类的实例。 +这里给出一个 Protobuf 数据定义文件的例子: + +```protobuf +message Person { + required string name = 1; + required int32 id = 2; + optional string email = 3; +} +``` + +这段定义表示在 `Person` 这种数据类型中,必须包含 `name`、`id` 
这两个字段,选择性包含 `email` 字段。根据这份定义文件,用户就可以选择一种编程语言,定义一个含有成员变量 `name`、`id`、`email` 的 `Person` 类,把这个类的某个实例用 Protobuf 存储成二进制文件;反之,用户也可以用二进制文件和对应的数据定义文件,读取出一个 `Person` 类的实例。 + +而对于 ONNX ,它的 Protobuf 数据定义文件在其[开源库](https://github.com/onnx/onnx/tree/main/onnx)中,这些文件定义了神经网络中模型、节点、张量的数据类型规范;而数据定义文件对应的二进制文件就是我们熟悉的“.onnx"文件,每一个 ".onnx" 文件按照数据定义规范,存储了一个神经网络的所有相关数据。直接用 Protobuf 生成 ONNX 模型还是比较麻烦的。幸运的是,ONNX 提供了很多实用 API,我们可以在完全不了解 Protobuf 的前提下,构造和读取 ONNX 模型。 + + +### ONNX 的结构定义 + +在用 API 对 ONNX 模型进行操作之前,我们还需要先了解一下 ONNX 的结构定义规则,学习一下 ONNX 在 Protobuf 定义文件里是怎样描述一个神经网络的。 + +回想一下,神经网络本质上是一个计算图。计算图的节点是算子,边是参与运算的张量。而通过可视化 ONNX 模型,我们知道 ONNX 记录了所有算子节点的属性信息,并把参与运算的张量信息存储在算子节点的输入输出信息中。事实上,ONNX 模型的结构可以用类图大致表示如下: + +![](https://user-images.githubusercontent.com/47652064/170020689-9a069a63-a4b7-44c0-8833-59e07c52fd5e.jpg) + +如图所示,一个 ONNX 模型可以用 `ModelProto` 类表示。`ModelProto` 包含了版本、创建者等日志信息,还包含了存储计算图结构的 `graph`。`GraphProto` 类则由输入张量信息、输出张量信息、节点信息组成。张量信息 `ValueInfoProto` 类包括张量名、基本数据类型、形状。节点信息 `NodeProto` 类包含了算子名、算子输入张量名、算子输出张量名。 +让我们来看一个具体的例子。假如我们有一个描述 `output=a*x+b` 的 ONNX 模型 `model`,用 `print(model)` 可以输出以下内容: + +```python +ir_version: 8 +graph { + node { + input: "a" + input: "x" + output: "c" + op_type: "Mul" + } + node { + input: "c" + input: "b" + output: "output" + op_type: "Add" + } + name: "linear_func" + input { + name: "a" + type { + tensor_type { + elem_type: 1 + shape { + dim {dim_value: 10} + dim {dim_value: 10} + } + } + } + } + input { + name: "x" + type { + tensor_type { + elem_type: 1 + shape { + dim {dim_value: 10} + dim {dim_value: 10} + } + } + } + } + input { + name: "b" + type { + tensor_type { + elem_type: 1 + shape { + dim {dim_value: 10} + dim {dim_value: 10} + } + } + } + } + output { + name: "output" + type { + tensor_type { + elem_type: 1 + shape { + dim { dim_value: 10} + dim { dim_value: 10} + } + } + } + } +} +opset_import {version: 15} +``` + +对应上文中的类图,这个模型的信息由 `ir_version`,`opset_import` 等全局信息和 `graph` 图信息组成。而 `graph` 包含一个乘法节点、一个加法节点、三个输入张量 `a, x, b` 以及一个输出张量 
`output`。在下一节里,我们会用 API 构造出这个模型,并输出这段结果。 + +## 读写 ONNX 模型 + +### 构造 ONNX 模型 + +在上一小节中,我们知道了 ONNX 模型是按以下的结构组织起来的: + +* ModelProto + * GraphProto + * NodeProto + * ValueInfoProto + +现在,让我们抛开 PyTorch,尝试完全用 ONNX 的 Python API 构造一个描述线性函数 `output=a*x+b` 的 ONNX 模型。我们将根据上面的结构,自底向上地构造这个模型。 + +首先,我们可以用 `helper.make_tensor_value_info` 构造出一个描述张量信息的 `ValueInfoProto` 对象。如前面的类图所示,我们要传入张量名、张量的基本数据类型、张量形状这三个信息。在 ONNX 中,不管是输入张量还是输出张量,它们的表示方式都是一样的。因此,这里我们用类似的方式为三个输入 `a, x, b` 和一个输出 `output` 构造 `ValueInfoProto` 对象。如下面的代码所示: + +```python +import onnx +from onnx import helper +from onnx import TensorProto + +a = helper.make_tensor_value_info('a', TensorProto.FLOAT, [10, 10]) +x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 10]) +b = helper.make_tensor_value_info('b', TensorProto.FLOAT, [10, 10]) +output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [10, 10]) +``` + +之后,我们要构造算子节点信息 `NodeProto`,这可以通过在 `helper.make_node` 中传入算子类型、输入张量名、输出张量名这三个信息来实现。我们这里先构造了描述 `c=a*x` 的乘法节点,再构造了 `output=c+b` 的加法节点。如下面的代码所示: + +```python +mul = helper.make_node('Mul', ['a', 'x'], ['c']) +add = helper.make_node('Add', ['c', 'b'], ['output']) +``` + +在计算机中,图一般是用一个节点集和一个边集表示的。而 ONNX 巧妙地把边的信息保存在了节点信息里,省去了保存边集的步骤。在 ONNX 中,如果某节点的输入名和之前某节点的输出名相同,就默认这两个节点是相连的。如上面的例子所示:`Mul` 节点定义了输出 `c`,`Add` 节点定义了输入 `c`,则 `Mul` 节点和 `Add` 节点是相连的。 + +正是因为有这种边的隐式定义规则,所以 ONNX 对节点的输入有一定的要求:一个节点的输入,要么是整个模型的输入,要么是之前某个节点的输出。如果我们把 `a, x, b` 中的某个输入节点从计算图中拿出(这个操作会在之后的代码中介绍),或者把 `Mul` 的输出从 `c` 改成 `d`,则最终的 ONNX 模型都是不满足标准的。 + +> 一个不满足标准的 ONNX 模型可能无法被推理引擎正确识别。ONNX 提供了 API `onnx.checker.check_model` 来判断一个 ONNX 模型是否满足标准。 + +接下来,我们用 `helper.make_graph` 来构造计算图 `GraphProto`。`helper.make_graph` 函数需要传入节点、图名称、输入张量信息、输出张量信息这 4 个参数。如下面的代码所示,我们把之前构造出来的 `NodeProto` 对象和 `ValueInfoProto` 对象按照顺序传入即可。 + +```python +graph = helper.make_graph([mul, add], 'linear_func', [a, x, b], [output]) +``` + +这里 `make_graph` 的节点参数有一个要求:计算图的节点必须以拓扑序给出。 + +> 拓扑序是与有向图相关的数学概念。如果按拓扑序遍历所有节点的话,能保证每个节点的输入都能在之前节点的输出里找到(对于 ONNX 模型,我们把计算图的输入张量也看成“之前的输出”)。 + 
+如果对这个概念不熟也没有关系,我们以刚刚构造出来的这个计算图为研究对象,通过下图展示的两个例子来直观理解拓扑序。 + +![](https://user-images.githubusercontent.com/47652064/170644483-160313b4-b000-4ad1-85b5-816278c7df80.png) + +这里我们只关注 `Mul` 和 `Add` 节点以及它们之间的边 `c`。在情况 1 中:如果我们的节点以 `[Mul, Add]` 顺序给出,那么遍历到 `Add` 时,它的输入 `c` 可以在之前的 `Mul` 的输出中找到。但是,如情况 2 所示:如果我们的节点以 `[Add, Mul]` 的顺序给出,那么 `Add` 就找不到输入边,计算图也无法成功构造出来了。这里的 `[Mul, Add]` 就是符合有向图的拓扑序的,而 `[Add, Mul]` 则不满足。 + +最后,我们用 `helper.make_model` 把计算图 `GraphProto` 封装进模型 `ModelProto` 里,一个 ONNX 模型就构造完成了。`make_model` 函数中还可以添加模型制作者、版本等信息,为了简单起见,我们没有添加额外的信息。如下面的代码所示: + +```python +model = helper.make_model(graph) +``` + +构造完模型之后,我们用下面这三行代码来检查模型正确性、把模型以文本形式输出、存储到一个 ".onnx" 文件里。这里用 `onnx.checker.check_model` 来检查模型是否满足 ONNX 标准是必要的,因为无论模型是否满足标准,ONNX 都允许我们用 onnx.save 存储模型。我们肯定不希望生成一个不满足标准的模型。 + +```python +onnx.checker.check_model(model) +print(model) +onnx.save(model, 'linear_func.onnx') +``` + +成功执行这些代码的话,程序会以文本格式输出模型的信息,其内容应该和我们在上一节展示的输出一样。 + +整理一下,用 ONNX Python API 构造模型的代码如下: + +```python +import onnx +from onnx import helper +from onnx import TensorProto + +# input and output +a = helper.make_tensor_value_info('a', TensorProto.FLOAT, [10, 10]) +x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 10]) +b = helper.make_tensor_value_info('b', TensorProto.FLOAT, [10, 10]) +output = helper.make_tensor_value_info('output', TensorProto.FLOAT, [10, 10]) + +# Mul +mul = helper.make_node('Mul', ['a', 'x'], ['c']) + +# Add +add = helper.make_node('Add', ['c', 'b'], ['output']) + +# graph and model +graph = helper.make_graph([mul, add], 'linear_func', [a, x, b], [output]) +model = helper.make_model(graph) + +# save model +onnx.checker.check_model(model) +print(model) +onnx.save(model, 'linear_func.onnx') +``` + +老规矩,我们可以用 ONNX Runtime 运行模型,来看看模型是否正确: + +```python +import onnxruntime +import numpy as np + +sess = onnxruntime.InferenceSession('linear_func.onnx') +a = np.random.rand(10, 10).astype(np.float32) +b = np.random.rand(10, 10).astype(np.float32) +x = np.random.rand(10, 
10).astype(np.float32) + +output = sess.run(['output'], {'a': a, 'b': b, 'x': x})[0] + +assert np.allclose(output, a * x + b) +``` + +一切顺利的话,这段代码不会有任何报错信息。这说明我们的模型等价于执行 a * x + b 这个计算。 + + +### 读取并修改 ONNX 模型 +通过用 API 构造 ONNX 模型,我们已经彻底搞懂了 ONNX 由哪些模块组成。现在,让我们看看该如何读取现有的".onnx"文件并从中提取模型信息。 + +首先,我们可以用下面的代码读取一个 ONNX 模型: + +```python +import onnx +model = onnx.load('linear_func.onnx') +print(model) +``` + +之前在输出模型时,我们传给 `onnx.save` 的是一个 `ModelProto` 的对象。同理,用上面的 `onnx.load` 读取 ONNX 模型时,我们收获的也是一个 `ModelProto` 的对象。输出这个对象后,我们应该得到和之前完全相同的输出。 +接下来,我们来看看怎么把图 `GraphProto`、节点 `NodeProto`、张量信息 `ValueInfoProto` 读取出来: + +```python +graph = model.graph +node = graph.node +input = graph.input +output = graph.output +print(node) +print(input) +print(output) +``` + +使用如上这些代码,我们可以分别访问模型的图、节点、张量信息。这里大家或许会有疑问:该怎样找出 `graph.node,graph.input` 中 `node, input` 这些属性名称呢?其实,属性的名称就写在每个对象的输出里。我们以 `print(node)` 的输出为例: + +```python +[input: "a" +input: "x" +output: "c" +op_type: "Mul" +, input: "c" +input: "b" +output: "output" +op_type: "Add" +] +``` + +在这段输出中,我们能看出 `node` 其实就是一个列表,列表中的对象有属性 `input, output, op_type`(这里 `input` 也是一个列表,它包含的两个元素都显示出来了)。我们可以用下面的代码来获取 `node` 里第一个节点 `Mul` 的属性: + +```python +node_0 = node[0] +node_0_inputs = node_0.input +node_0_outputs = node_0.output +input_0 = node_0_inputs[0] +input_1 = node_0_inputs[1] +output = node_0_outputs[0] +op_type = node_0.op_type + +print(input_0) +print(input_1) +print(output) +print(op_type) + +# Output +""" +a +x +c +Mul +""" +``` + +当我们想知道 ONNX 模型某数据对象有哪些属性时,我们不必去翻 ONNX 文档,只需要先把数据对象输出一下,然后在输出结果找出属性名即可。 + +读取完 ONNX 模型的信息后,修改 ONNX 模型就是一件很轻松的事了。我们既可以按照上一小节的模型构造方法,新建节点和张量信息,与原有模型组合成一个新的模型,也可以在不违反 ONNX 规范的前提下直接修改某个数据对象的属性。 + +这里我们来看一个直接修改模型属性的例子: + +```python +import onnx +model = onnx.load('linear_func.onnx') + +node = model.graph.node +node[1].op_type = 'Sub' + +onnx.checker.check_model(model) +onnx.save(model, 'linear_func_2.onnx') +``` + +在读入之前的 `linear_func.onnx` 模型后,我们可以直接修改第二个节点的类型 `node[1].op_type`,把加法变成减法。这样,我们的模型描述的是 `a * x - b` 
这个线性函数。大家感兴趣的话,可以用 ONNX Runtime 运行新模型 `linear_func_2.onnx`,来验证一下它和 `a * x - b` 是否等价。 + +## 调试 ONNX 模型 +在实际部署中,如果用深度学习框架导出的 ONNX 模型出了问题,一般要通过修改框架的代码来解决,而不会从 ONNX 入手,我们把 ONNX 模型当成一个不可修改的黑盒看待。 +现在,我们已经深入学习了 ONNX 的原理,可以尝试对 ONNX 模型本身进行调试了。在这一节里,让我们看看该如何巧妙利用 ONNX 提供的子模型提取功能,对 ONNX 模型进行调试。 + +### 子模型提取 +ONNX 官方为开发者提供了子模型提取(extract)的功能。子模型提取,顾名思义,就是从一个给定的 ONNX 模型中,拿出一个子模型。这个子模型的节点集、边集都是原模型中对应集合的子集。让我们来用 PyTorch 导出一个复杂一点的 ONNX 模型,并在它的基础上执行提取操作: + +```python +import torch + +class Model(torch.nn.Module): + + def __init__(self): + super().__init__() + self.convs1 = torch.nn.Sequential(torch.nn.Conv2d(3, 3, 3), + torch.nn.Conv2d(3, 3, 3), + torch.nn.Conv2d(3, 3, 3)) + self.convs2 = torch.nn.Sequential(torch.nn.Conv2d(3, 3, 3), + torch.nn.Conv2d(3, 3, 3)) + self.convs3 = torch.nn.Sequential(torch.nn.Conv2d(3, 3, 3), + torch.nn.Conv2d(3, 3, 3)) + self.convs4 = torch.nn.Sequential(torch.nn.Conv2d(3, 3, 3), + torch.nn.Conv2d(3, 3, 3), + torch.nn.Conv2d(3, 3, 3)) + def forward(self, x): + x = self.convs1(x) + x1 = self.convs2(x) + x2 = self.convs3(x) + x = x1 + x2 + x = self.convs4(x) + return x + +model = Model() +input = torch.randn(1, 3, 20, 20) + +torch.onnx.export(model, input, 'whole_model.onnx') +``` + +这个模型的可视化结果如下图所示(提取子模型需要输入边的序号,为了方便大家阅读,这幅图标出了之后要用到的边的序号): + +![](https://user-images.githubusercontent.com/47652064/170644578-bcaaa2aa-bdd4-4cb3-856b-c6d621273357.png) + + +> 在前面的章节中,我们学过,ONNX 的边是用同名张量表示的。也就是说,这里的边序号,实际上是前一个节点的输出张量序号和后一个节点的输入张量序号。由于这个模型是用 PyTorch 导出的,这些张量序号都是 PyTorch 自动生成的。 + +接着,我们可以用下面的代码提取出一个子模型: + +```python +import onnx + +onnx.utils.extract_model('whole_model.onnx', 'partial_model.onnx', ['22'], ['28']) +``` + +子模型的可视化结果如下图所示: + +![](https://user-images.githubusercontent.com/47652064/170644616-42cd9d11-1525-49b2-b302-b96e985c5e79.png) + +通过观察代码和输出图,应该不难猜出这段代码的作用是把原计算图从边 22 到边 28 的子图提取出来,并组成一个子模型。`onnx.utils.extract_model` 就是完成子模型提取的函数,它的参数分别是原模型路径、输出模型路径、子模型的输入边(输入张量)、子模型的输出边(输出张量)。 + +直观地来看,子模型提取就是把输入边到输出边之间的全部节点都取出来。那么,这个功能在使用上有什么限制呢?基于 
`whole_model.onnx`,我们来看一看三个子模型提取的示例。 + +#### 添加额外输出 + +我们在提取时新设定了一个输出张量,如下面的代码所示: + +```python +onnx.utils.extract_model('whole_model.onnx', 'submodel_1.onnx', ['22'], ['27', '31']) +``` + +我们可以看到子模型会添加一条把张量输出的新边,如下图所示: + +![](https://user-images.githubusercontent.com/47652064/170644722-d63156e5-cd74-4faa-ac0a-ce408be949eb.png) + +#### 添加冗余输入 + +如果我们还是像开始一样提取边 22 到边 28 之间的子模型,但是多添加了一个输入 input.1,那么提取出的子模型会有一个冗余的输入 input.1,如下面的代码所示: + +```python +onnx.utils.extract_model('whole_model.onnx', 'submodel_2.onnx', ['22', 'input.1'], ['28']) +``` + +从下图中可以看出:无论给这个输入传入什么值,都不会影响子模型的输出。可以认为如果只用子模型的部分输入就能得到输出,那么那些“较早”的多出来的输入就是冗余的。 + +![](https://user-images.githubusercontent.com/47652064/170644751-c8100d04-585b-4f93-9ed0-7a77dca88c16.png) + +#### 输入信息不足 + +这次,我们尝试提取的子模型输入是边 24,输出是边 28。如下面的代码和图所示: + +```python +# Error +onnx.utils.extract_model('whole_model.onnx', 'submodel_3.onnx', ['24'], ['28']) +``` + +![](https://user-images.githubusercontent.com/47652064/170644773-627af9d0-8c3f-447c-9fbf-dc63a31c40ab.png) + +从图中可以看出,想通过边 24 计算边 28 的结果,至少还需要输入边 26,或者更上面的边。仅凭借边 24 是无法计算出边 28 的结果的,因此这样提取子模型会报错。 + +通过上面几个使用示例,我们可以整理出子模型提取的实现原理:新建一个模型,把给定的输入和输出填入。之后把图的所有有向边反向,从输出边开始遍历节点,碰到输入边则停止,把这样遍历得到的节点作为子模型的节点。 + +如果还没有彻底弄懂这个提取原理,没关系,我们只要尽量保证在填写子模型的输入输出时,让输出恰好可以由输入决定即可。 + +### 输出 ONNX 中间节点的值 + +在使用 ONNX 模型时,最常见的一个需求是能够用推理引擎输出中间节点的值。这多见于深度学习框架模型和 ONNX 模型的精度对齐中,因为只要能够输出中间节点的值,就能定位到精度出现偏差的算子。我们来看看如何用子模型提取实现这一任务。 + +在刚刚的第一个子模型提取示例中,我们添加了一条原来模型中不存在的输出边。用同样的原理,我们可以在保持原有输入输出不变的同时,新增加一些输出,提取出一个能输出中间节点的“子模型”。例如: + +```python +onnx.utils.extract_model('whole_model.onnx', 'more_output_model.onnx', ['input.1'], ['31', '23', '25', '27']) +``` + +在这个子模型中,我们在保持原有的输入 `input.1`,输出 `31` 的同时,把其他几个边加入了输出中。如下图所示: + +![](https://user-images.githubusercontent.com/47652064/170020845-6e1cb45b-962a-40ba-a17b-e47b0bdcd3bf.png) + +这样,用 ONNX Runtime 运行 `more_output_model.onnx` 这个模型时,我们就能得到更多的输出了。 +为了方便调试,我们还可以把原模型拆分成多个互不相交的子模型。这样,在每次调试时,可以只对原模型的部分子模块调试。比如: + +```python +onnx.utils.extract_model('whole_model.onnx', 
'debug_model_1.onnx', ['input.1'], ['23']) +onnx.utils.extract_model('whole_model.onnx', 'debug_model_2.onnx', ['23'], ['25']) +onnx.utils.extract_model('whole_model.onnx', 'debug_model_3.onnx', ['23'], ['27']) +onnx.utils.extract_model('whole_model.onnx', 'debug_model_4.onnx', ['25', '27'], ['31']) +``` + +在这个例子中,我们把原来较为复杂的模型拆成了四个较为简单的子模型,如下图所示。在调试时,我们可以先调试顶层的子模型,确认顶层子模型无误后,把它的输出做为后面子模型的输入。 + +比如对于这些子模型,我们可以先调试第一个子模型,并存储输出 23。之后把张量 23 做为第二个和第三个子模型的输入,调试这两个模型。最后用同样方法调试第四个子模型。可以说,有了子模型提取功能,哪怕是面对一个庞大的模型,我们也能够从中提取出有问题的子模块,细致地只对这个子模块调试。 + +![](https://user-images.githubusercontent.com/47652064/170020865-e4d59a4f-7c57-4a12-b300-b7f5da0e1b80.png) + +--- + +子模型提取固然是一个便利的 ONNX 调试工具。但是,在实际的情况中,我们一般是用 PyTorch 等框架导出 ONNX 模型。这里有两个问题: + +1. 一旦 PyTorch 模型改变,ONNX 模型的边序号也会改变。这样每次提取同样的子模块时都要重新去 ONNX 模型里查序号,如此繁琐的调试方法是不会在实践中采用的。 +2. 即使我们能保证 ONNX 的边序号不发生改变,我们也难以把 PyTorch 代码和 ONNX 节点对应起来——当模型结构变得十分复杂时,要识别 ONNX 中每个节点的含义是不可能的。 + +MMDeploy 为 PyTorch 模型添加了模型分块功能。使用这个功能,我们可以通过只修改 PyTorch 模型的实现代码来把原模型导出成多个互不相交的子 ONNX 模型。我们会在后续教程中对其介绍。 + +## 总结 + +在这篇教程中,我们抛开了 PyTorch,学习了 ONNX 模型本身的知识。老规矩,我们来总结一下这篇教程的知识点: + +* ONNX 使用 Protobuf 定义规范和序列化模型。 +* 一个 ONNX 模型主要由 `ModelProto`,`GraphProto`,`NodeProto`,`ValueInfoProto` 这几个数据类的对象组成。 +* 使用 `onnx.helper.make_xxx`,我们可以构造 ONNX 模型的数据对象。 +* `onnx.save()` 可以保存模型,`onnx.load()` 可以读取模型,`onnx.checker.check_model()` 可以检查模型是否符合规范。 +* `onnx.utils.extract_model()` 可以从原模型中取出部分节点,和新定义的输入、输出边构成一个新的子模型。 +* 利用子模型提取功能,我们可以输出原 ONNX 模型的中间结果,实现对 ONNX 模型的调试。 + +至此,我们对 ONNX 相关知识的学习就告一段落了。回顾一下,我们先学习了 PyTorch 转 ONNX 有关 API 的用法;接着,我们学习了如何用自定义算子解决 PyTorch 和 ONNX 表达能力不足的问题;最后我们单独学习了 ONNX 模型的调试方法。通过对 ONNX 由浅入深的学习,我们基本可以应对模型部署中和 ONNX 有关的绝大多数问题了。 + +如果大家想了解更多有关 ONNX API 的知识,可以去阅读 ONNX 的[官方 Python API 文档](https://github.com/onnx/onnx/blob/main/docs/PythonAPIOverview.md)。 From 182cc517463908127728282d2775cf2579c38758 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Wed, 1 Jun 2022 19:48:29 +0800 Subject: [PATCH 50/51] fix pspnet torchscript conversion (#538) * fix pspnet torchscript 
conversion * resolve comment * add IR to rewrite --- .../codebase/mmseg/models/decode_heads/psp_head.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mmdeploy/codebase/mmseg/models/decode_heads/psp_head.py b/mmdeploy/codebase/mmseg/models/decode_heads/psp_head.py index c792237029..210e6c7ad5 100644 --- a/mmdeploy/codebase/mmseg/models/decode_heads/psp_head.py +++ b/mmdeploy/codebase/mmseg/models/decode_heads/psp_head.py @@ -4,11 +4,11 @@ from mmseg.ops import resize from mmdeploy.core import FUNCTION_REWRITER -from mmdeploy.utils import is_dynamic_shape +from mmdeploy.utils import IR, get_root_logger, is_dynamic_shape @FUNCTION_REWRITER.register_rewriter( - func_name='mmseg.models.decode_heads.psp_head.PPM.forward') + func_name='mmseg.models.decode_heads.psp_head.PPM.forward', ir=IR.ONNX) def ppm__forward(ctx, self, x): """Rewrite `forward` for default backend. @@ -34,9 +34,10 @@ def ppm__forward(ctx, self, x): for ppm in self: if isinstance(ppm[0], nn.AdaptiveAvgPool2d) and \ ppm[0].output_size != 1: - assert not is_dynamic_flag, 'AdaptiveAvgPool2d is not \ - supported with dynamic shape in backends' - + if is_dynamic_flag: + logger = get_root_logger() + logger.warning('`AdaptiveAvgPool2d` would be ' + 'replaced to `AvgPool2d` explicitly') # replace AdaptiveAvgPool2d with AvgPool2d explicitly output_size = 2 * [ppm[0].output_size] k = [int(size[i] / output_size[i]) for i in range(0, len(size))] From 2a0fcb6e71ff1ec937079e1e7f577eb500123a20 Mon Sep 17 00:00:00 2001 From: sanjaypavo <93761297+sanjaypavo@users.noreply.github.com> Date: Tue, 7 Jun 2022 11:51:17 +0530 Subject: [PATCH 51/51] changing the onnxwrapper script for gpu issue (#532) * changing the onnxwrapper script * gpu_issue * Update wrapper.py * Update wrapper.py * Update runtime.txt * Update runtime.txt * Update wrapper.py --- mmdeploy/backend/onnxruntime/wrapper.py | 11 +++++------ requirements/runtime.txt | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/mmdeploy/backend/onnxruntime/wrapper.py b/mmdeploy/backend/onnxruntime/wrapper.py index 4239853e2d..daac6bf515 100644 --- a/mmdeploy/backend/onnxruntime/wrapper.py +++ b/mmdeploy/backend/onnxruntime/wrapper.py @@ -50,9 +50,9 @@ def __init__(self, logger.warning(f'The library of onnxruntime custom ops does \ not exist: {ort_custom_op_path}') device_id = parse_device_id(device) - is_cuda_available = ort.get_device() == 'GPU' - providers = [('CUDAExecutionProvider', {'device_id': device_id})] \ - if is_cuda_available else ['CPUExecutionProvider'] + providers = ['CPUExecutionProvider'] \ + if device == 'cpu' else \ + [('CUDAExecutionProvider', {'device_id': device_id})] sess = ort.InferenceSession( onnx_file, session_options, providers=providers) if output_names is None: @@ -60,8 +60,7 @@ def __init__(self, self.sess = sess self.io_binding = sess.io_binding() self.device_id = device_id - self.is_cuda_available = is_cuda_available - self.device_type = 'cuda' if is_cuda_available else 'cpu' + self.device_type = 'cpu' if device == 'cpu' else 'cuda' super().__init__(output_names) def forward(self, inputs: Dict[str, @@ -77,7 +76,7 @@ def forward(self, inputs: Dict[str, for name, input_tensor in inputs.items(): # set io binding for inputs/outputs input_tensor = input_tensor.contiguous() - if not self.is_cuda_available: + if self.device_type == 'cpu': input_tensor = input_tensor.cpu() # Avoid unnecessary data transfer between host and device element_type = input_tensor.new_zeros( diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 6114dfc58f..aa7aec20ea 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -2,5 +2,6 @@ h5py matplotlib numpy onnx>=1.8.0 +protobuf==3.20.0 six terminaltables
93.84
ShuffleNetV1 1.0xShuffleNetV1 Classification top-1 68.1368.13 67.71 68.11$MMCLS_DIR/configs/shufflenet_v1/shufflenet_v1_1x_b64x16_linearlr_bn_nowd_imagenet.py$MMCLS_DIR/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py
top-587.80
ShuffleNetV2 1.0xShuffleNetV2 Classification top-1 69.5569.54 69.10 69.54$MMCLS_DIR/configs/shufflenet_v2/shufflenet_v2_1x_b64x16_linearlr_bn_nowd_imagenet.py$MMCLS_DIR/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py
top-571.87 70.91 71.84$MMEDIT_DIR/configs/restorers/real_esrgan/realesrnet_c64b23g32_12x4_lr2e-4_1000k_df2k_ost.py$MMEDIT_DIR/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py
top-5