Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions lite/kernels/arm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ add_kernel(generate_proposals_v2_compute_arm ARM extra SRCS generate_proposals_v
add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} collect_fpn_proposals_compute_host)
add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} distribute_fpn_proposals_compute_host)
add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(pixel_shuffle_compute_arm ARM extra SRCS pixel_shuffle_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(scatter_compute_arm ARM extra SRCS scatter_compute.cc DEPS ${lite_kernel_deps} math_arm)
Expand Down
159 changes: 2 additions & 157 deletions lite/kernels/arm/collect_fpn_proposals_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,168 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/arm/collect_fpn_proposals_compute.h"
#include <numeric>
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

// Associates a proposal's score with everything needed to locate the
// proposal again after sorting: the image it belongs to (batch_id), its
// row within its FPN level's RoI tensor (index), and that level (level).
struct ScoreWithID {
  float score = 0.0f;  // previously left uninitialized by the default ctor
  int batch_id = -1;
  int index = -1;
  int level = -1;
  ScoreWithID() = default;
  ScoreWithID(float score_, int batch_id_, int index_, int level_)
      : score(score_), batch_id(batch_id_), index(index_), level(level_) {}
};

static inline bool CompareByScore(ScoreWithID a, ScoreWithID b) {
return a.score >= b.score;
}

static inline bool CompareByBatchid(ScoreWithID a, ScoreWithID b) {
return a.batch_id < b.batch_id;
}

// Gathers the RoIs proposed at every FPN level, keeps the post_nms_topN
// highest-scoring ones across all levels, groups the survivors by batch
// image, and writes them into the single output tensor fpn_rois together
// with a LoD describing the per-image partition.
void CollectFpnProposalsCompute::Run() {
  auto& param = Param<operators::CollectFpnProposalsParam>();
  auto multi_layer_rois = param.multi_level_rois;
  auto multi_layer_scores = param.multi_level_scores;
  auto* fpn_rois = param.fpn_rois;
  int post_nms_topN = param.post_nms_topN;

  if (multi_layer_rois.size() != multi_layer_scores.size()) {
    // Fixed typo in the original message ("equan" -> "equal").
    LOG(FATAL) << "multi_layer_rois.size() should be equal to "
                  "multi_layer_scores.size()";
  }

  // integral_of_all_rois[i] holds the number of RoIs in levels [0, i);
  // the last entry is the grand total across all levels.
  size_t num_fpn_level = multi_layer_rois.size();
  std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0);
  int num_size = param.multi_rois_num.size();
  for (size_t i = 0; i < num_fpn_level; ++i) {
    int all_rois = 0;
    if (num_size == 0) {
      // No RoisNum tensors: derive the level's count from its LoD.
      auto cur_rois_lod = multi_layer_rois[i]->lod().back();
      all_rois = cur_rois_lod[cur_rois_lod.size() - 1];
    } else {
      // RoisNum given: the level's total is the sum over batch images.
      const int* cur_rois_num = param.multi_rois_num[i]->data<int>();
      all_rois = std::accumulate(
          cur_rois_num, cur_rois_num + param.multi_rois_num[i]->numel(), 0);
    }
    integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois;
  }
  const int batch_size = (num_size == 0)
                             ? multi_layer_rois[0]->lod().back().size() - 1
                             : param.multi_rois_num[0]->numel();

  // Flatten every (score, batch_id, row, level) tuple into one array.
  std::vector<ScoreWithID> scores_of_all_rois(
      integral_of_all_rois[num_fpn_level], ScoreWithID());
  for (int i = 0; i < num_fpn_level; ++i) {
    const float* cur_level_scores = multi_layer_scores[i]->data<float>();
    int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i];
    int cur_batch_id = 0;
    int pre_num = 0;
    // Hoisted out of the j-loop: the original copied the LoD vector on
    // every iteration (accidentally quadratic) and additionally read
    // lod().back() unconditionally before the loop, even when the LoD may
    // be unset because RoisNum tensors are provided instead.
    std::vector<uint64_t> cur_scores_lod;
    if (num_size == 0) {
      cur_scores_lod = multi_layer_scores[i]->lod().back();
    }
    for (int j = 0; j < cur_level_num; ++j) {
      // Advance cur_batch_id when j crosses the current image's boundary.
      if (num_size == 0) {
        if (static_cast<size_t>(j) >= cur_scores_lod[cur_batch_id + 1]) {
          cur_batch_id++;
        }
      } else {
        const int* rois_num_data = param.multi_rois_num[i]->data<int>();
        if (j >= pre_num + rois_num_data[cur_batch_id]) {
          pre_num += rois_num_data[cur_batch_id];
          cur_batch_id++;
        }
      }
      int cur_index = j + integral_of_all_rois[i];
      scores_of_all_rois[cur_index].score = cur_level_scores[j];
      scores_of_all_rois[cur_index].index = j;
      scores_of_all_rois[cur_index].level = i;
      scores_of_all_rois[cur_index].batch_id = cur_batch_id;
    }
  }

  // keep top post_nms_topN rois, sort the rois by the score
  if (post_nms_topN > integral_of_all_rois[num_fpn_level]) {
    post_nms_topN = integral_of_all_rois[num_fpn_level];
  }
  std::stable_sort(
      scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByScore);
  scores_of_all_rois.resize(post_nms_topN);
  // sort by batch id so each image's survivors become contiguous
  std::stable_sort(
      scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByBatchid);
  // create a pointer array into each level's RoI data
  std::vector<const float*> multi_fpn_rois_data(num_fpn_level);
  for (int i = 0; i < num_fpn_level; ++i) {
    multi_fpn_rois_data[i] = multi_layer_rois[i]->data<float>();
  }

  // initialize the outputs: copy each surviving box (kBoxDim floats) and
  // record a LoD offset each time the batch id changes.
  const int kBoxDim = 4;
  auto fpn_rois_data = fpn_rois->mutable_data<float>();
  std::vector<uint64_t> lod0(1, 0);
  int cur_batch_id = 0;
  std::vector<int64_t> num_per_batch;
  int pre_idx = 0;
  int cur_num = 0;
  for (int i = 0; i < post_nms_topN; ++i) {
    int cur_fpn_level = scores_of_all_rois[i].level;
    int cur_level_index = scores_of_all_rois[i].index;
    std::memcpy(fpn_rois_data,
                multi_fpn_rois_data[cur_fpn_level] + cur_level_index * kBoxDim,
                kBoxDim * sizeof(float));
    fpn_rois_data += kBoxDim;
    if (scores_of_all_rois[i].batch_id != cur_batch_id) {
      cur_batch_id = scores_of_all_rois[i].batch_id;
      lod0.emplace_back(i);
      cur_num = i - pre_idx;
      pre_idx = i;
      num_per_batch.emplace_back(cur_num);
    }
  }
  num_per_batch.emplace_back(post_nms_topN - pre_idx);
  if (param.rois_num) {
    // NOTE(review): if some image keeps zero surviving RoIs,
    // num_per_batch holds fewer than batch_size entries and this read
    // runs past the end — presumably every image retains at least one
    // RoI; verify against callers.
    int* rois_num_data = param.rois_num->mutable_data<int>();
    for (int i = 0; i < batch_size; i++) {
      rois_num_data[i] = num_per_batch[i];
    }
  }
  lod0.emplace_back(post_nms_topN);
  lite::LoD lod;
  lod.emplace_back(lod0);
  fpn_rois->set_lod(lod);
  return;
}

} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
#include "lite/kernels/host/collect_fpn_proposals_compute.h"

REGISTER_LITE_KERNEL(collect_fpn_proposals,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::CollectFpnProposalsCompute,
paddle::lite::kernels::host::CollectFpnProposalsCompute,
def)
.BindInput("MultiLevelRois", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("MultiLevelScores", {LiteType::GetTensorTy(TARGET(kARM))})
Expand Down
158 changes: 2 additions & 156 deletions lite/kernels/arm/distribute_fpn_proposals_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,167 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/arm/distribute_fpn_proposals_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

const int kBoxDim = 4;

// Computes the area of an axis-aligned box given as
// [xmin, ymin, xmax, ymax]. Degenerate boxes (xmax < xmin or ymax < ymin)
// yield area 0. When the coordinates are not normalized to [0, 1], each
// side length is counted inclusively in pixels, i.e. (max - min + 1).
template <typename T>
static inline T BBoxArea(const T* box, bool normalized) {
  const T xmin = box[0];
  const T ymin = box[1];
  const T xmax = box[2];
  const T ymax = box[3];
  if (xmax < xmin || ymax < ymin) {
    // Invalid coordinates: treat as an empty box.
    return static_cast<T>(0.);
  }
  const T width = xmax - xmin;
  const T height = ymax - ymin;
  return normalized ? width * height : (width + 1) * (height + 1);
}

// Builds a LoD offset vector from per-image RoI counts:
// result[0] = 0 and result[i + 1] = result[i] + rois_num[i], so
// consecutive entries bracket each image's RoI rows.
inline std::vector<uint64_t> GetLodFromRoisNum(const Tensor* rois_num) {
  const auto* counts = rois_num->data<int>();
  std::vector<uint64_t> offsets;
  offsets.push_back(static_cast<uint64_t>(0));
  for (int i = 0; i < rois_num->numel(); ++i) {
    offsets.push_back(offsets.back() + static_cast<uint64_t>(counts[i]));
  }
  return offsets;
}

// Distributes the input FPN RoIs into one output tensor per pyramid level
// based on each RoI's scale (sqrt of its area): RoIs whose scale is near
// refer_scale map to refer_level, and the target level is clamped to
// [min_level, max_level]. Also emits restore_index, which records, for
// each original RoI row, its position in the concatenated per-level
// outputs.
void DistributeFpnProposalsCompute::Run() {
  auto& param = Param<operators::DistributeFpnProposalsParam>();
  const lite::Tensor* fpn_rois = param.fpn_rois;
  std::vector<lite::Tensor*> multi_fpn_rois = param.multi_fpn_rois;
  lite::Tensor* restore_index = param.restore_index;
  int min_level = param.min_level;
  int max_level = param.max_level;
  int refer_level = param.refer_level;
  int refer_scale = param.refer_scale;
  int num_level = max_level - min_level + 1;

  // Per-image offsets into fpn_rois: taken from the RoisNum tensor when
  // it is provided, otherwise from the input tensor's LoD.
  std::vector<uint64_t> fpn_rois_lod;
  int fpn_rois_num;
  if (param.rois_num) {
    fpn_rois_lod = GetLodFromRoisNum(param.rois_num);
  } else {
    fpn_rois_lod = fpn_rois->lod().back();
  }
  fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];

  std::vector<int> target_level;
  // record the number of rois in each level
  std::vector<int> num_rois_level(num_level, 0);
  std::vector<int> num_rois_level_integral(num_level + 1, 0);
  // First pass: compute every RoI's target level and count RoIs per level
  // so output tensors can be sized before copying.
  for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
    auto fpn_rois_slice =
        fpn_rois->Slice<float>(static_cast<int64_t>(fpn_rois_lod[i]),
                               static_cast<int64_t>(fpn_rois_lod[i + 1]));
    const float* rois_data = fpn_rois_slice.data<float>();
    for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
      // get the target level of current rois
      // (the 1e-6 term guards log2 against a zero-area box)
      float roi_scale = std::sqrt(BBoxArea(rois_data, false));
      int tgt_lvl =
          std::floor(log2(roi_scale / refer_scale + static_cast<float>(1e-6)) +
                     refer_level);
      tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
      target_level.push_back(tgt_lvl);
      num_rois_level[tgt_lvl - min_level]++;
      rois_data += kBoxDim;
    }
  }
  // define the output rois
  // pointer which point to each level fpn rois
  std::vector<float*> multi_fpn_rois_data(num_level);
  // lod0 which will record the offset information of each level rois
  std::vector<std::vector<uint64_t>> multi_fpn_rois_lod0;
  for (int i = 0; i < num_level; ++i) {
    // allocate memory for each level rois
    multi_fpn_rois[i]->Resize({num_rois_level[i], kBoxDim});
    multi_fpn_rois_data[i] = multi_fpn_rois[i]->mutable_data<float>();
    std::vector<uint64_t> lod0(1, 0);
    multi_fpn_rois_lod0.push_back(lod0);
    // statistic start point for each level rois
    num_rois_level_integral[i + 1] =
        num_rois_level_integral[i] + num_rois_level[i];
  }
  restore_index->Resize({fpn_rois_num, 1});
  int* restore_index_data = restore_index->mutable_data<int>();
  // restore_index_inter[shuffled_pos] = original row; inverted below into
  // restore_index_data.
  std::vector<int> restore_index_inter(fpn_rois_num, -1);
  // distribute the rois into different fpn level by target level
  for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
    Tensor fpn_rois_slice =
        fpn_rois->Slice<float>(static_cast<int64_t>(fpn_rois_lod[i]),
                               static_cast<int64_t>(fpn_rois_lod[i + 1]));
    const float* rois_data = fpn_rois_slice.data<float>();
    size_t cur_offset = fpn_rois_lod[i];
    // std::vector<size_t > lod_offset[num_level];
    // Seed every level's LoD entry for this image with the running count;
    // the entries are incremented below as boxes are appended.
    for (int j = 0; j < num_level; j++) {
      multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
    }
    for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
      int lvl = target_level[cur_offset + j];
      // Append this kBoxDim-float box to its target level's output buffer.
      memcpy(multi_fpn_rois_data[lvl - min_level],
             rois_data,
             kBoxDim * sizeof(float));
      multi_fpn_rois_data[lvl - min_level] += kBoxDim;
      // Position of this box in the concatenated per-level outputs: start
      // of its level plus how many boxes that level has taken so far.
      int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
                             multi_fpn_rois_lod0[lvl - min_level][i + 1];
      restore_index_inter[index_in_shuffle] = cur_offset + j;
      multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
      rois_data += kBoxDim;
    }
  }
  // Invert the mapping: for each shuffled position i, store i at the
  // original row it came from.
  for (int i = 0; i < fpn_rois_num; ++i) {
    restore_index_data[restore_index_inter[i]] = i;
  }
  // Optional per-level RoisNum outputs: number of boxes each image
  // contributed to each level, derived from consecutive LoD offsets.
  if (param.multi_rois_num.size() > 0) {
    int batch_size = fpn_rois_lod.size() - 1;
    for (int i = 0; i < num_level; ++i) {
      param.multi_rois_num[i]->Resize({batch_size});
      int* rois_num_data = param.multi_rois_num[i]->mutable_data<int>();
      for (int j = 0; j < batch_size; ++j) {
        rois_num_data[j] = static_cast<int>(multi_fpn_rois_lod0[i][j + 1] -
                                            multi_fpn_rois_lod0[i][j]);
      }
    }
  }
  // merge lod information into LoDTensor
  for (int i = 0; i < num_level; ++i) {
    lite::LoD lod;
    lod.emplace_back(multi_fpn_rois_lod0[i]);
    multi_fpn_rois[i]->set_lod(lod);
  }
  return;
}

} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
#include "lite/kernels/host/distribute_fpn_proposals_compute.h"

REGISTER_LITE_KERNEL(distribute_fpn_proposals,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::DistributeFpnProposalsCompute,
paddle::lite::kernels::host::DistributeFpnProposalsCompute,
def)
.BindInput("FpnRois", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("RoisNum",
Expand Down
4 changes: 3 additions & 1 deletion lite/kernels/host/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ add_kernel(linspace_compute_host Host extra SRCS linspace_compute.cc DEPS ${lite
add_kernel(beam_search_decode_compute_host Host extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps})
add_kernel(roi_perspective_transform_compute_host Host extra SRCS roi_perspective_transform_compute.cc DEPS ${lite_kernel_deps})
add_kernel(lod_reset_compute_host Host extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps})
add_kernel(argsort Host extra SRCS argsort_compute.cc DEPS ${lite_kernel_deps})
add_kernel(argsort_compute_host Host extra SRCS argsort_compute.cc DEPS ${lite_kernel_deps})
add_kernel(distribute_fpn_proposals_compute_host Host extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps})
add_kernel(collect_fpn_proposals_compute_host Host extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps})

if(LITE_BUILD_EXTRA AND LITE_WITH_x86)
lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host)
Expand Down
Loading