diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt
index fab2964877a..afa3b3bec33 100644
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -80,8 +80,8 @@ add_kernel(generate_proposals_v2_compute_arm ARM extra SRCS generate_proposals_v
 add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} collect_fpn_proposals_compute_host) # the ARM source only registers the host kernel class
+add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} distribute_fpn_proposals_compute_host) # the ARM source only registers the host kernel class
 add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(pixel_shuffle_compute_arm ARM extra SRCS pixel_shuffle_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(scatter_compute_arm ARM extra SRCS scatter_compute.cc DEPS ${lite_kernel_deps} math_arm)
diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.cc b/lite/kernels/arm/collect_fpn_proposals_compute.cc
index ae2981e5410..bb609cd3058 100644
--- a/lite/kernels/arm/collect_fpn_proposals_compute.cc
+++ b/lite/kernels/arm/collect_fpn_proposals_compute.cc
@@ -12,168 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/collect_fpn_proposals_compute.h"
-#include <numeric>
-#include <string>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/tensor.h"
-#include "lite/core/type_system.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-struct ScoreWithID {
-  float score;
-  int batch_id;
-  int index;
-  int level;
-  ScoreWithID() {
-    batch_id = -1;
-    index = -1;
-    level = -1;
-  }
-  ScoreWithID(float score_, int batch_id_, int index_, int level_) {
-    score = score_;
-    batch_id = batch_id_;
-    index = index_;
-    level = level_;
-  }
-};
-
-static inline bool CompareByScore(ScoreWithID a, ScoreWithID b) {
-  return a.score >= b.score;
-}
-
-static inline bool CompareByBatchid(ScoreWithID a, ScoreWithID b) {
-  return a.batch_id < b.batch_id;
-}
-
-void CollectFpnProposalsCompute::Run() {
-  auto& param = Param<operators::CollectFpnProposalsParam>();
-  auto multi_layer_rois = param.multi_level_rois;
-  auto multi_layer_scores = param.multi_level_scores;
-  auto* fpn_rois = param.fpn_rois;
-  int post_nms_topN = param.post_nms_topN;
-
-  if (multi_layer_rois.size() != multi_layer_scores.size()) {
-    LOG(FATAL) << "multi_layer_rois.size() should be equan to "
-                  "multi_layer_scores.size()";
-  }
-
-  size_t num_fpn_level = multi_layer_rois.size();
-  std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0);
-  int num_size = param.multi_rois_num.size();
-  for (size_t i = 0; i < num_fpn_level; ++i) {
-    int all_rois = 0;
-    if (num_size == 0) {
-      auto cur_rois_lod = multi_layer_rois[i]->lod().back();
-      all_rois = cur_rois_lod[cur_rois_lod.size() - 1];
-    } else {
-      const int* cur_rois_num = param.multi_rois_num[i]->data<int>();
-      all_rois = std::accumulate(
-          cur_rois_num, cur_rois_num + param.multi_rois_num[i]->numel(), 0);
-    }
-    integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois;
-  }
-  const int batch_size = (num_size == 0)
-                             ? multi_layer_rois[0]->lod().back().size() - 1
-                             : param.multi_rois_num[0]->numel();
-  std::vector<ScoreWithID> scores_of_all_rois(
-      integral_of_all_rois[num_fpn_level], ScoreWithID());
-  for (int i = 0; i < num_fpn_level; ++i) {
-    const float* cur_level_scores = multi_layer_scores[i]->data<float>();
-    int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i];
-    auto cur_scores_lod = multi_layer_scores[i]->lod().back();
-    int cur_batch_id = 0;
-    int pre_num = 0;
-    for (int j = 0; j < cur_level_num; ++j) {
-      if (num_size == 0) {
-        auto cur_scores_lod = multi_layer_scores[i]->lod().back();
-        if (static_cast<size_t>(j) >= cur_scores_lod[cur_batch_id + 1]) {
-          cur_batch_id++;
-        }
-      } else {
-        const int* rois_num_data = param.multi_rois_num[i]->data<int>();
-        if (j >= pre_num + rois_num_data[cur_batch_id]) {
-          pre_num += rois_num_data[cur_batch_id];
-          cur_batch_id++;
-        }
-      }
-      int cur_index = j + integral_of_all_rois[i];
-      scores_of_all_rois[cur_index].score = cur_level_scores[j];
-      scores_of_all_rois[cur_index].index = j;
-      scores_of_all_rois[cur_index].level = i;
-      scores_of_all_rois[cur_index].batch_id = cur_batch_id;
-    }
-  }
-
-  // keep top post_nms_topN rois, sort the rois by the score
-  if (post_nms_topN > integral_of_all_rois[num_fpn_level]) {
-    post_nms_topN = integral_of_all_rois[num_fpn_level];
-  }
-  std::stable_sort(
-      scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByScore);
-  scores_of_all_rois.resize(post_nms_topN);
-  // sort by batch id
-  std::stable_sort(
-      scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByBatchid);
-  // create a pointer array
-  std::vector<const float*> multi_fpn_rois_data(num_fpn_level);
-  for (int i = 0; i < num_fpn_level; ++i) {
-    multi_fpn_rois_data[i] = multi_layer_rois[i]->data<float>();
-  }
-
-  // initialize the outputs
-  const int kBoxDim = 4;
-  auto fpn_rois_data = fpn_rois->mutable_data<float>();
-  std::vector<uint64_t> lod0(1, 0);
-  int cur_batch_id = 0;
-  std::vector<int64_t> num_per_batch;
-  int pre_idx = 0;
-  int cur_num = 0;
-  for (int i = 0; i < post_nms_topN; ++i) {
-    int cur_fpn_level = scores_of_all_rois[i].level;
-    int cur_level_index = scores_of_all_rois[i].index;
-    std::memcpy(fpn_rois_data,
-                multi_fpn_rois_data[cur_fpn_level] + cur_level_index * kBoxDim,
-                kBoxDim * sizeof(float));
-    fpn_rois_data += kBoxDim;
-    if (scores_of_all_rois[i].batch_id != cur_batch_id) {
-      cur_batch_id = scores_of_all_rois[i].batch_id;
-      lod0.emplace_back(i);
-      cur_num = i - pre_idx;
-      pre_idx = i;
-      num_per_batch.emplace_back(cur_num);
-    }
-  }
-  num_per_batch.emplace_back(post_nms_topN - pre_idx);
-  if (param.rois_num) {
-    int* rois_num_data = param.rois_num->mutable_data<int>();
-    for (int i = 0; i < batch_size; i++) {
-      rois_num_data[i] = num_per_batch[i];
-    }
-  }
-  lod0.emplace_back(post_nms_topN);
-  lite::LoD lod;
-  lod.emplace_back(lod0);
-  fpn_rois->set_lod(lod);
-  return;
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
+#include "lite/kernels/host/collect_fpn_proposals_compute.h"
 
 REGISTER_LITE_KERNEL(collect_fpn_proposals,
                      kARM,
                      kFloat,
                      kNCHW,
-                     paddle::lite::kernels::arm::CollectFpnProposalsCompute,
+                     paddle::lite::kernels::host::CollectFpnProposalsCompute,
                      def)
     .BindInput("MultiLevelRois", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindInput("MultiLevelScores", {LiteType::GetTensorTy(TARGET(kARM))})
diff --git a/lite/kernels/arm/distribute_fpn_proposals_compute.cc b/lite/kernels/arm/distribute_fpn_proposals_compute.cc
index eb8df0650c1..ec0e0df7fb5 100644
--- a/lite/kernels/arm/distribute_fpn_proposals_compute.cc
+++ b/lite/kernels/arm/distribute_fpn_proposals_compute.cc
@@ -12,167 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/distribute_fpn_proposals_compute.h"
-#include <string>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/tensor.h"
-#include "lite/core/type_system.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-const int kBoxDim = 4;
-
-template <typename T>
-static inline T BBoxArea(const T* box, bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-
-inline std::vector<uint64_t> GetLodFromRoisNum(const Tensor* rois_num) {
-  std::vector<uint64_t> rois_lod;
-  auto* rois_num_data = rois_num->data<int>();
-
-  rois_lod.push_back(static_cast<uint64_t>(0));
-  for (int i = 0; i < rois_num->numel(); ++i) {
-    rois_lod.push_back(rois_lod.back() +
-                       static_cast<uint64_t>(rois_num_data[i]));
-  }
-  return rois_lod;
-}
-
-void DistributeFpnProposalsCompute::Run() {
-  auto& param = Param<operators::DistributeFpnProposalsParam>();
-  const lite::Tensor* fpn_rois = param.fpn_rois;
-  std::vector<lite::Tensor*> multi_fpn_rois = param.multi_fpn_rois;
-  lite::Tensor* restore_index = param.restore_index;
-  int min_level = param.min_level;
-  int max_level = param.max_level;
-  int refer_level = param.refer_level;
-  int refer_scale = param.refer_scale;
-  int num_level = max_level - min_level + 1;
-
-  std::vector<uint64_t> fpn_rois_lod;
-  int fpn_rois_num;
-  if (param.rois_num) {
-    fpn_rois_lod = GetLodFromRoisNum(param.rois_num);
-  } else {
-    fpn_rois_lod = fpn_rois->lod().back();
-  }
-  fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
-
-  std::vector<int> target_level;
-  // record the number of rois in each level
-  std::vector<int> num_rois_level(num_level, 0);
-  std::vector<int> num_rois_level_integral(num_level + 1, 0);
-  for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-    auto fpn_rois_slice =
-        fpn_rois->Slice<float>(static_cast<int64_t>(fpn_rois_lod[i]),
-                               static_cast<int64_t>(fpn_rois_lod[i + 1]));
-    const float* rois_data = fpn_rois_slice.data<float>();
-    for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
-      // get the target level of current rois
-      float roi_scale = std::sqrt(BBoxArea(rois_data, false));
-      int tgt_lvl =
-          std::floor(log2(roi_scale / refer_scale + static_cast<float>(1e-6)) +
-                     refer_level);
-      tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
-      target_level.push_back(tgt_lvl);
-      num_rois_level[tgt_lvl - min_level]++;
-      rois_data += kBoxDim;
-    }
-  }
-  // define the output rois
-  // pointer which point to each level fpn rois
-  std::vector<float*> multi_fpn_rois_data(num_level);
-  // lod0 which will record the offset information of each level rois
-  std::vector<std::vector<uint64_t>> multi_fpn_rois_lod0;
-  for (int i = 0; i < num_level; ++i) {
-    // allocate memory for each level rois
-    multi_fpn_rois[i]->Resize({num_rois_level[i], kBoxDim});
-    multi_fpn_rois_data[i] = multi_fpn_rois[i]->mutable_data<float>();
-    std::vector<uint64_t> lod0(1, 0);
-    multi_fpn_rois_lod0.push_back(lod0);
-    // statistic start point for each level rois
-    num_rois_level_integral[i + 1] =
-        num_rois_level_integral[i] + num_rois_level[i];
-  }
-  restore_index->Resize({fpn_rois_num, 1});
-  int* restore_index_data = restore_index->mutable_data<int>();
-  std::vector<int> restore_index_inter(fpn_rois_num, -1);
-  // distribute the rois into different fpn level by target level
-  for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-    Tensor fpn_rois_slice =
-        fpn_rois->Slice<float>(static_cast<int64_t>(fpn_rois_lod[i]),
-                               static_cast<int64_t>(fpn_rois_lod[i + 1]));
-    const float* rois_data = fpn_rois_slice.data<float>();
-    size_t cur_offset = fpn_rois_lod[i];
-    // std::vector<size_t > lod_offset[num_level];
-    for (int j = 0; j < num_level; j++) {
-      multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
-    }
-    for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
-      int lvl = target_level[cur_offset + j];
-      memcpy(multi_fpn_rois_data[lvl - min_level],
-             rois_data,
-             kBoxDim * sizeof(float));
-      multi_fpn_rois_data[lvl - min_level] += kBoxDim;
-      int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
-                             multi_fpn_rois_lod0[lvl - min_level][i + 1];
-      restore_index_inter[index_in_shuffle] = cur_offset + j;
-      multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
-      rois_data += kBoxDim;
-    }
-  }
-  for (int i = 0; i < fpn_rois_num; ++i) {
-    restore_index_data[restore_index_inter[i]] = i;
-  }
-  if (param.multi_rois_num.size() > 0) {
-    int batch_size = fpn_rois_lod.size() - 1;
-    for (int i = 0; i < num_level; ++i) {
-      param.multi_rois_num[i]->Resize({batch_size});
-      int* rois_num_data = param.multi_rois_num[i]->mutable_data<int>();
-      for (int j = 0; j < batch_size; ++j) {
-        rois_num_data[j] = static_cast<int>(multi_fpn_rois_lod0[i][j + 1] -
-                                            multi_fpn_rois_lod0[i][j]);
-      }
-    }
-  }
-  // merge lod information into LoDTensor
-  for (int i = 0; i < num_level; ++i) {
-    lite::LoD lod;
-    lod.emplace_back(multi_fpn_rois_lod0[i]);
-    multi_fpn_rois[i]->set_lod(lod);
-  }
-  return;
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
+#include "lite/kernels/host/distribute_fpn_proposals_compute.h"
 
 REGISTER_LITE_KERNEL(distribute_fpn_proposals,
                      kARM,
                      kFloat,
                      kNCHW,
-                     paddle::lite::kernels::arm::DistributeFpnProposalsCompute,
+                     paddle::lite::kernels::host::DistributeFpnProposalsCompute,
                      def)
     .BindInput("FpnRois", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindInput("RoisNum",
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index 7f861dcfb00..16d6d90ce22 100644
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -75,7 +75,9 @@ add_kernel(linspace_compute_host Host extra SRCS linspace_compute.cc DEPS ${lite
 add_kernel(beam_search_decode_compute_host Host extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(roi_perspective_transform_compute_host Host extra SRCS roi_perspective_transform_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(lod_reset_compute_host Host extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps})
-add_kernel(argsort Host extra SRCS argsort_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(argsort_compute_host Host extra SRCS argsort_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(distribute_fpn_proposals_compute_host Host extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps}) # also a DEPS entry of the ARM kernel target
+add_kernel(collect_fpn_proposals_compute_host Host extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps}) # also a DEPS entry of the ARM kernel target
 
 if(LITE_BUILD_EXTRA AND LITE_WITH_x86)
   lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host)
diff --git a/lite/kernels/host/collect_fpn_proposals_compute.cc b/lite/kernels/host/collect_fpn_proposals_compute.cc
new file mode 100644
index 00000000000..d13f53799af
--- /dev/null
+++ b/lite/kernels/host/collect_fpn_proposals_compute.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/host/collect_fpn_proposals_compute.h"
+#include <numeric>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+// One candidate RoI collected by CollectFpnProposalsCompute::Run(): the
+// score it is ranked by together with the batch item, level and row it
+// came from.
+struct ScoreWithID {
+  float score;   // ranking key
+  int batch_id;  // batch item the RoI belongs to
+  int index;     // row of the RoI within its level
+  int level;     // position of the source level in the input list
+
+  // Placeholder entry: ids are -1 and the score is 0, so no member is left
+  // indeterminate.
+  ScoreWithID() : score(0.f), batch_id(-1), index(-1), level(-1) {}
+
+  // Fully specified entry.
+  ScoreWithID(float score_, int batch_id_, int index_, int level_)
+      : score(score_), batch_id(batch_id_), index(index_), level(level_) {}
+};
+
+static inline bool CompareByScore(ScoreWithID a, ScoreWithID b) {
+  return a.score > b.score;  // descending; strict so equal scores compare false
+}
+
+static inline bool CompareByBatchid(ScoreWithID lhs, ScoreWithID rhs) {
+  return rhs.batch_id > lhs.batch_id;  // ascending by batch id
+}
+
+// Merges the RoIs of all FPN levels into `fpn_rois`: keeps the post_nms_topN
+// best-scoring ones, groups them by batch id, and records the grouping in
+// the output LoD and, when bound, in `rois_num`.
+void CollectFpnProposalsCompute::Run() {
+  auto& param = Param<operators::CollectFpnProposalsParam>();
+  auto multi_layer_rois = param.multi_level_rois;
+  auto multi_layer_scores = param.multi_level_scores;
+  auto* fpn_rois = param.fpn_rois;
+  int post_nms_topN = param.post_nms_topN;
+
+  if (multi_layer_rois.size() != multi_layer_scores.size()) {
+    LOG(FATAL) << "multi_layer_rois.size() should be equal to "
+                  "multi_layer_scores.size()";
+  }
+
+  const int num_fpn_level = static_cast<int>(multi_layer_rois.size());
+  const bool use_rois_num = !param.multi_rois_num.empty();
+  // integral_of_all_rois[i] is the number of RoIs in levels [0, i).
+  std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0);
+  for (int i = 0; i < num_fpn_level; ++i) {
+    int all_rois = 0;
+    if (use_rois_num) {
+      const int* cur_rois_num = param.multi_rois_num[i]->data<int>();
+      all_rois = std::accumulate(
+          cur_rois_num, cur_rois_num + param.multi_rois_num[i]->numel(), 0);
+    } else {
+      auto cur_rois_lod = multi_layer_rois[i]->lod().back();
+      all_rois = cur_rois_lod[cur_rois_lod.size() - 1];
+    }
+    integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois;
+  }
+  const int batch_size = use_rois_num
+                             ? param.multi_rois_num[0]->numel()
+                             : multi_layer_rois[0]->lod().back().size() - 1;
+  std::vector<ScoreWithID> scores_of_all_rois(
+      integral_of_all_rois[num_fpn_level], ScoreWithID());
+  for (int i = 0; i < num_fpn_level; ++i) {
+    const float* cur_level_scores = multi_layer_scores[i]->data<float>();
+    const int level_begin = integral_of_all_rois[i];
+    const int cur_level_num = integral_of_all_rois[i + 1] - level_begin;
+    // Exclusive end offset of each batch item inside this level, built once
+    // per level; the scores' LoD is only read when RoisNum is not supplied.
+    std::vector<int> batch_end;
+    if (use_rois_num) {
+      const int* rois_num_data = param.multi_rois_num[i]->data<int>();
+      batch_end.resize(param.multi_rois_num[i]->numel());
+      std::partial_sum(
+          rois_num_data, rois_num_data + batch_end.size(), batch_end.begin());
+    } else {
+      auto cur_scores_lod = multi_layer_scores[i]->lod().back();
+      for (size_t b = 1; b < cur_scores_lod.size(); ++b) {
+        batch_end.push_back(static_cast<int>(cur_scores_lod[b]));
+      }
+    }
+    const int last_batch = static_cast<int>(batch_end.size()) - 1;
+    int cur_batch_id = 0;
+    for (int j = 0; j < cur_level_num; ++j) {
+      // `while`, not `if`: consecutive empty batches must all be skipped.
+      while (cur_batch_id < last_batch && j >= batch_end[cur_batch_id]) {
+        ++cur_batch_id;
+      }
+      scores_of_all_rois[level_begin + j] =
+          ScoreWithID(cur_level_scores[j], cur_batch_id, j, i);
+    }
+  }
+
+  // keep top post_nms_topN rois, sort the rois by the score
+  if (post_nms_topN > integral_of_all_rois[num_fpn_level]) {
+    post_nms_topN = integral_of_all_rois[num_fpn_level];
+  }
+  std::stable_sort(
+      scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByScore);
+  scores_of_all_rois.resize(post_nms_topN);
+  // sort by batch id
+  std::stable_sort(
+      scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByBatchid);
+  // create a pointer array
+  std::vector<const float*> multi_fpn_rois_data(num_fpn_level);
+  for (int i = 0; i < num_fpn_level; ++i) {
+    multi_fpn_rois_data[i] = multi_layer_rois[i]->data<float>();
+  }
+
+  // initialize the outputs
+  const int kBoxDim = 4;
+  auto fpn_rois_data = fpn_rois->mutable_data<float>();
+  std::vector<uint64_t> lod0(1, 0);
+  int cur_batch_id = 0;
+  // Counted per batch id, so a batch that keeps no RoI reports 0 instead of
+  // shifting later entries or leaving the copy into `rois_num` out of range.
+  std::vector<int> num_per_batch(batch_size > 0 ? batch_size : 0, 0);
+  for (int i = 0; i < post_nms_topN; ++i) {
+    const ScoreWithID& picked = scores_of_all_rois[i];
+    std::memcpy(fpn_rois_data,
+                multi_fpn_rois_data[picked.level] + picked.index * kBoxDim,
+                kBoxDim * sizeof(float));
+    fpn_rois_data += kBoxDim;
+    if (picked.batch_id != cur_batch_id) {
+      cur_batch_id = picked.batch_id;
+      lod0.emplace_back(i);
+    }
+    if (picked.batch_id < batch_size) ++num_per_batch[picked.batch_id];
+  }
+  if (param.rois_num) {
+    int* rois_num_data = param.rois_num->mutable_data<int>();
+    std::copy(num_per_batch.begin(), num_per_batch.end(), rois_num_data);
+  }
+  lod0.emplace_back(post_nms_topN);
+  lite::LoD lod;
+  lod.emplace_back(lod0);
+  fpn_rois->set_lod(lod);
+}
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+// NOTE(review): confirm RoisNum is the input and MultiLevelRoIsNum the output.
+REGISTER_LITE_KERNEL(collect_fpn_proposals,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::CollectFpnProposalsCompute,
+                     def)
+    .BindInput("MultiLevelRois", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("MultiLevelScores", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("RoisNum", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("FpnRois", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("MultiLevelRoIsNum", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindPaddleOpVersion("collect_fpn_proposals", 1)
+    .Finalize();
diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.h b/lite/kernels/host/collect_fpn_proposals_compute.h
similarity index 86%
rename from lite/kernels/arm/collect_fpn_proposals_compute.h
rename to lite/kernels/host/collect_fpn_proposals_compute.h
index f1e7448a07a..f00b312dc20 100644
--- a/lite/kernels/arm/collect_fpn_proposals_compute.h
+++ b/lite/kernels/host/collect_fpn_proposals_compute.h
@@ -13,17 +13,16 @@
 // limitations under the License.
 
 #pragma once
-#include <algorithm>
 #include "lite/core/kernel.h"
-#include "lite/operators/axpy_op.h"
+#include "lite/core/op_registry.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
 class CollectFpnProposalsCompute
-    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+    : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
  public:
   using param_t = operators::CollectFpnProposalsParam;
 
@@ -32,7 +31,7 @@ class CollectFpnProposalsCompute
   virtual ~CollectFpnProposalsCompute() = default;
 };
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/host/distribute_fpn_proposals_compute.cc b/lite/kernels/host/distribute_fpn_proposals_compute.cc
new file mode 100644
index 00000000000..bc8e4dfe9db
--- /dev/null
+++ b/lite/kernels/host/distribute_fpn_proposals_compute.cc
@@ -0,0 +1,183 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/host/distribute_fpn_proposals_compute.h"
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+const int kBoxDim = 4;  // number of floats stored per RoI row
+
+// Area of `box` (read as box[0..3]); 0 when box[2] < box[0] or
+// box[3] < box[1]. Non-normalized boxes get +1 on each side length.
+template <typename T>
+static inline T BBoxArea(const T* box, bool normalized) {
+  const bool is_invalid = box[2] < box[0] || box[3] < box[1];
+  if (is_invalid) {
+    // Reversed corners (e.g. xmax < xmin or ymax < ymin) give zero area.
+    return static_cast<T>(0.);
+  }
+  const T width = box[2] - box[0];
+  const T height = box[3] - box[1];
+  if (!normalized) {
+    // Coordinate values are not within range [0, 1].
+    return (width + 1) * (height + 1);
+  }
+  return width * height;
+}
+
+// Turns the per-batch counts in `rois_num` into offsets that start at 0.
+inline std::vector<uint64_t> GetLodFromRoisNum(const Tensor* rois_num) {
+  const int* counts = rois_num->data<int>();
+  std::vector<uint64_t> offsets(1, 0);
+  uint64_t running = 0;
+  for (int k = 0; k < rois_num->numel(); ++k) {
+    running += static_cast<uint64_t>(counts[k]);
+    offsets.push_back(running);
+  }
+  return offsets;
+}
+
+// Assigns every RoI in `fpn_rois` to an FPN level chosen from its scale, then
+// fills `multi_fpn_rois`, `restore_index` and, if bound, `multi_rois_num`.
+void DistributeFpnProposalsCompute::Run() {
+  auto& param = Param<operators::DistributeFpnProposalsParam>();
+  const lite::Tensor* fpn_rois = param.fpn_rois;
+  std::vector<lite::Tensor*> multi_fpn_rois = param.multi_fpn_rois;
+  lite::Tensor* restore_index = param.restore_index;
+  int min_level = param.min_level;
+  int max_level = param.max_level;
+  int refer_level = param.refer_level;
+  int refer_scale = param.refer_scale;
+  int num_level = max_level - min_level + 1;
+  // Per-batch RoI offsets: from `rois_num` when set, else the last LoD level.
+  std::vector<uint64_t> fpn_rois_lod;
+  int fpn_rois_num;  // total number of input RoIs
+  if (param.rois_num) {
+    fpn_rois_lod = GetLodFromRoisNum(param.rois_num);
+  } else {
+    fpn_rois_lod = fpn_rois->lod().back();
+  }
+  fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
+
+  std::vector<int> target_level;  // chosen level per RoI, in input order
+  // First pass: pick a level for every RoI and count the RoIs per level.
+  std::vector<int> num_rois_level(num_level, 0);
+  std::vector<int> num_rois_level_integral(num_level + 1, 0);
+  for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
+    auto fpn_rois_slice =
+        fpn_rois->Slice<float>(static_cast<int64_t>(fpn_rois_lod[i]),
+                               static_cast<int64_t>(fpn_rois_lod[i + 1]));
+    const float* rois_data = fpn_rois_slice.data<float>();
+    for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
+      // target level from sqrt(area), clamped to [min_level, max_level]
+      float roi_scale = std::sqrt(BBoxArea(rois_data, false));
+      int tgt_lvl =
+          std::floor(log2(roi_scale / refer_scale + static_cast<float>(1e-6)) +
+                     refer_level);
+      tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
+      target_level.push_back(tgt_lvl);
+      num_rois_level[tgt_lvl - min_level]++;
+      rois_data += kBoxDim;
+    }
+  }
+  // per-level write cursors into the output RoI tensors
+  std::vector<float*> multi_fpn_rois_data(num_level);
+  // lod0 which will record the offset information of each level rois
+  std::vector<std::vector<uint64_t>> multi_fpn_rois_lod0;
+  for (int i = 0; i < num_level; ++i) {
+    multi_fpn_rois[i]->Resize({num_rois_level[i], kBoxDim});  // one row per RoI
+    multi_fpn_rois_data[i] = multi_fpn_rois[i]->mutable_data<float>();
+    std::vector<uint64_t> lod0(1, 0);
+    multi_fpn_rois_lod0.push_back(lod0);
+    // num_rois_level_integral[i] = number of RoIs in the levels before i
+    num_rois_level_integral[i + 1] =
+        num_rois_level_integral[i] + num_rois_level[i];
+  }
+  restore_index->Resize({fpn_rois_num, 1});
+  int* restore_index_data = restore_index->mutable_data<int>();
+  std::vector<int> restore_index_inter(fpn_rois_num, -1);
+  // Second pass: copy each RoI to its level and note its shuffled position.
+  for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
+    Tensor fpn_rois_slice =
+        fpn_rois->Slice<float>(static_cast<int64_t>(fpn_rois_lod[i]),
+                               static_cast<int64_t>(fpn_rois_lod[i + 1]));
+    const float* rois_data = fpn_rois_slice.data<float>();
+    size_t cur_offset = fpn_rois_lod[i];
+    for (int j = 0; j < num_level; j++) {
+      multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
+    }
+    for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
+      int lvl = target_level[cur_offset + j];
+      memcpy(multi_fpn_rois_data[lvl - min_level],
+             rois_data,
+             kBoxDim * sizeof(float));
+      multi_fpn_rois_data[lvl - min_level] += kBoxDim;
+      int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
+                             multi_fpn_rois_lod0[lvl - min_level][i + 1];
+      restore_index_inter[index_in_shuffle] = cur_offset + j;
+      multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
+      rois_data += kBoxDim;
+    }
+  }
+  // restore_index[k] = row of input RoI k in the level-by-level concatenation
+  for (int i = 0; i < fpn_rois_num; ++i) {
+    restore_index_data[restore_index_inter[i]] = i;
+  }
+  if (param.multi_rois_num.size() > 0) {
+    int batch_size = fpn_rois_lod.size() - 1;
+    for (int i = 0; i < num_level; ++i) {
+      param.multi_rois_num[i]->Resize({batch_size});
+      int* rois_num_data = param.multi_rois_num[i]->mutable_data<int>();
+      for (int j = 0; j < batch_size; ++j) {
+        rois_num_data[j] = static_cast<int>(multi_fpn_rois_lod0[i][j + 1] -
+                                            multi_fpn_rois_lod0[i][j]);
+      }
+    }
+  }
+  for (int i = 0; i < num_level; ++i) {
+    lite::LoD lod;
+    lod.emplace_back(multi_fpn_rois_lod0[i]);
+    multi_fpn_rois[i]->set_lod(lod);  // attach the per-level LoD
+  }
+  return;
+}
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+// Host registration; the count and index tensors are bound as int32.
+REGISTER_LITE_KERNEL(distribute_fpn_proposals,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::DistributeFpnProposalsCompute,
+                     def)
+    .BindInput("FpnRois", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("RoisNum",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindOutput("MultiFpnRois", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("MultiLevelRoIsNum",
+                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindOutput("RestoreIndex",
+                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindPaddleOpVersion("distribute_fpn_proposals", 1)
+    .Finalize();
diff --git a/lite/kernels/arm/distribute_fpn_proposals_compute.h b/lite/kernels/host/distribute_fpn_proposals_compute.h
similarity index 85%
rename from lite/kernels/arm/distribute_fpn_proposals_compute.h
rename to lite/kernels/host/distribute_fpn_proposals_compute.h
index e150b338de0..88e99674ee2 100644
--- a/lite/kernels/arm/distribute_fpn_proposals_compute.h
+++ b/lite/kernels/host/distribute_fpn_proposals_compute.h
@@ -13,17 +13,16 @@
 // limitations under the License.
 
 #pragma once
-#include <algorithm>
 #include "lite/core/kernel.h"
-#include "lite/operators/distribute_fpn_proposals_op.h"
+#include "lite/core/op_registry.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
 class DistributeFpnProposalsCompute
-    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+    : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
  public:
   using param_t = operators::DistributeFpnProposalsParam;
 
@@ -32,7 +31,7 @@ class DistributeFpnProposalsCompute
   virtual ~DistributeFpnProposalsCompute() = default;
 };
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle