Skip to content

Commit 66c19b4

Browse files
committed
[XPU] refactor thread_local
1 parent ff9406b commit 66c19b4

File tree

13 files changed

+483
-177
lines changed

13 files changed

+483
-177
lines changed

lite/api/cxx_api.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,11 @@ void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
182182

183183
#if !defined(LITE_WITH_METAL)
184184
lite::Tensor *Predictor::GetInput(size_t offset) {
185+
#ifdef LITE_WITH_XPU
186+
XPU_CALL(xpu_set_device(reinterpret_cast<lite::XPURunTimeOption *>(
187+
runtime_options_[TARGET(kXPU)].get())
188+
->xpu_dev_num));
189+
#endif
185190
CHECK(input_names_.size() > offset)
186191
<< "The network has " << input_names_.size() << " inputs"
187192
<< ", the offset should be less than this.";

lite/api/cxx_api.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,16 @@ class LITE_API Predictor {
164164
CheckInputValid();
165165

166166
#ifdef LITE_WITH_XPU
167+
if (lite::TargetWrapperXPU::xpu_runtime_ptr !=
168+
runtime_options_[TARGET(kXPU)].get()) {
169+
lite::TargetWrapperXPU::xpu_runtime_ptr =
170+
reinterpret_cast<lite::XPURunTimeOption*>(
171+
runtime_options_[TARGET(kXPU)].get());
172+
// Since the runtime context is thread_local, we must reset the XPU device
173+
// when switching between different predictors on the same thread.
174+
XPU_CALL(
175+
xpu_set_device(lite::TargetWrapperXPU::xpu_runtime_ptr->xpu_dev_num));
176+
}
167177
std::vector<std::vector<int64_t>> query_shape;
168178
for (size_t i = 0; i < input_names_.size(); i++) {
169179
query_shape.push_back(std::vector<int64_t>(GetInput(i)->dims().data()));
@@ -237,6 +247,29 @@ class LITE_API Predictor {
237247
void CheckPaddleOpVersions(
238248
const std::shared_ptr<cpp::ProgramDesc>& program_desc);
239249

250+
// Copies the target-specific runtime options carried by |config| into this
// predictor's own runtime-option table, so each predictor owns an
// independent option object (the config may be reused or destroyed later).
// Currently only the XPU target is handled; for other builds this is a no-op.
void SetRunTimeOption(const lite_api::CxxConfig& config) {
#ifdef LITE_WITH_XPU
  auto map_runtime_options = config.runtime_options();
  // Always install a fresh, predictor-owned option object for kXPU.
  std::shared_ptr<void> runtime_option =
      std::make_shared<lite::XPURunTimeOption>();
  runtime_options_.emplace(TARGET(kXPU), std::move(runtime_option));
  // If the config carries XPU options, copy their values into our object.
  // static_cast is the correct (and cheaper to audit) cast for a void*
  // that originally pointed at a lite::XPURunTimeOption.
  if (map_runtime_options[TARGET(kXPU)].get()) {
    static_cast<lite::XPURunTimeOption*>(runtime_options_[TARGET(kXPU)].get())
        ->Set(static_cast<const lite::XPURunTimeOption*>(
            map_runtime_options[TARGET(kXPU)].get()));
  }
#else
  (void)config;  // Unused when XPU support is compiled out.
#endif
}
264+
265+
#ifdef LITE_WITH_XPU
// Stores a user-provided XPU stream handle in this predictor's runtime
// options, so kernels launched by this predictor run on that stream.
void SetXPUStream(void* stream) {
  // The kXPU entry is installed by SetRunTimeOption() and is known to hold a
  // lite::XPURunTimeOption, so static_cast from void* is the right cast here.
  static_cast<lite::XPURunTimeOption*>(runtime_options_[TARGET(kXPU)].get())
      ->xpu_stream.SetXPUStream(stream);
}
#endif
272+
240273
// #ifdef LITE_WITH_TRAIN
241274
// void Run(const std::vector<framework::Tensor>& tensors) {
242275
// FeedVars(tensors);
@@ -257,6 +290,8 @@ class LITE_API Predictor {
257290
#endif
258291

259292
private:
293+
std::map<TargetType, std::shared_ptr<void>> runtime_options_;
294+
260295
std::shared_ptr<cpp::ProgramDesc> program_desc_;
261296
std::shared_ptr<Scope> scope_;
262297
Scope* exec_scope_;
@@ -323,6 +358,9 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
323358
const std::string& model_dir,
324359
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
325360
bool record_info = false) override;
361+
#ifdef LITE_WITH_XPU
362+
void SetXPUStream(void* stream);
363+
#endif
326364

327365
private:
328366
std::shared_ptr<Predictor> raw_predictor_;

lite/api/cxx_api_impl.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
4444
config_ = config;
4545
mode_ = config.power_mode();
4646
threads_ = config.threads();
47+
raw_predictor_->SetRunTimeOption(config);
4748
#ifdef LITE_USE_THREAD_POOL
4849
int thread_num = ThreadPool::Init(threads_);
4950
if (thread_num > 1) {
@@ -278,6 +279,12 @@ bool CxxPaddleApiImpl::TryShrinkMemory() {
278279
return raw_predictor_->TryShrinkMemory();
279280
}
280281

282+
#ifdef LITE_WITH_XPU
283+
void CxxPaddleApiImpl::SetXPUStream(void *stream) {
284+
raw_predictor_->SetXPUStream(stream);
285+
}
286+
#endif
287+
281288
} // namespace lite
282289

283290
namespace lite_api {

lite/api/paddle_api.cc

Lines changed: 55 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#ifdef LITE_WITH_XPU
2525
#include <functional>
2626
#include <mutex> // NOLINT
27+
#include "lite/backends/xpu/runtime_option.h"
2728
#include "lite/backends/xpu/target_wrapper.h"
2829
#endif
2930

@@ -266,6 +267,11 @@ ConfigBase::ConfigBase(PowerMode mode, int threads) {
266267
mode_ = lite::DeviceInfo::Global().mode();
267268
threads_ = lite::DeviceInfo::Global().threads();
268269
#endif
270+
#ifdef LITE_WITH_XPU
271+
std::shared_ptr<void> runtime_option =
272+
std::shared_ptr<lite::XPURunTimeOption>(new lite::XPURunTimeOption);
273+
runtime_options_.emplace(TARGET(kXPU), std::move(runtime_option));
274+
#endif
269275
}
270276

271277
void ConfigBase::set_opencl_binary_path_name(const std::string &path,
@@ -478,10 +484,14 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
478484
CHECK(lite::TargetWrapperXPU::shared_l3_size >= l3_size)
479485
<< "Enlarge XPU Shared L3 Cache Is Not Allowed.";
480486
}
481-
lite::TargetWrapperXPU::local_l3_size = 0;
487+
reinterpret_cast<lite::XPURunTimeOption *>(
488+
runtime_options()[TARGET(kXPU)].get())
489+
->xpu_local_l3_size = 0;
482490
lite::TargetWrapperXPU::need_l3_mutex = true;
483491
} else {
484-
lite::TargetWrapperXPU::local_l3_size = l3_size;
492+
reinterpret_cast<lite::XPURunTimeOption *>(
493+
runtime_options()[TARGET(kXPU)].get())
494+
->xpu_local_l3_size = l3_size;
485495
lite::TargetWrapperXPU::need_l3_mutex = false;
486496
}
487497
#else
@@ -493,17 +503,21 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
493503

494504
void CxxConfig::set_xpu_l3_cache_autotune(bool autotune) {
495505
#ifdef LITE_WITH_XPU
496-
lite::TargetWrapperXPU::local_l3_autotune = autotune;
506+
reinterpret_cast<lite::XPURunTimeOption *>(
507+
runtime_options()[TARGET(kXPU)].get())
508+
->xpu_local_l3_autotune = autotune;
497509
#else
498510
LOG(WARNING) << "The invoking of the function "
499511
"'set_xpu_l3_cache_autotune' is ignored, please "
500512
"rebuild it with LITE_WITH_XPU=ON.";
501513
#endif
502514
}
503515

504-
void set_xpu_gm_workspace_method(size_t gm_size) {
516+
void CxxConfig::set_xpu_gm_workspace_method(size_t gm_size) {
505517
#ifdef LITE_WITH_XPU
506-
lite::TargetWrapperXPU::local_gm_size = gm_size;
518+
reinterpret_cast<lite::XPURunTimeOption *>(
519+
runtime_options()[TARGET(kXPU)].get())
520+
->xpu_local_gm_size = gm_size;
507521
#else
508522
LOG(WARNING) << "The invoking of the function "
509523
"'set_xpu_gm_workspace_method' is ignored, please "
@@ -513,7 +527,9 @@ void set_xpu_gm_workspace_method(size_t gm_size) {
513527

514528
void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
515529
#ifdef LITE_WITH_XPU
516-
lite::TargetWrapperXPU::SetDev(dev_no);
530+
reinterpret_cast<lite::XPURunTimeOption *>(
531+
runtime_options()[TARGET(kXPU)].get())
532+
->xpu_dev_num = dev_no;
517533
#else
518534
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
519535
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
@@ -522,7 +538,9 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
522538

523539
void CxxConfig::enable_xpu_multi_stream() {
524540
#ifdef LITE_WITH_XPU
525-
lite::TargetWrapperXPU::enable_xpu_multi_stream();
541+
reinterpret_cast<lite::XPURunTimeOption *>(
542+
runtime_options()[TARGET(kXPU)].get())
543+
->xpu_enable_multi_stream = true;
526544
#else
527545
LOG(WARNING)
528546
<< "The invoking of the function 'enable_xpu_stream_per_thread' is "
@@ -591,7 +609,9 @@ void CxxConfig::set_xpu_conv_autotune(bool autotune,
591609

592610
void CxxConfig::set_xpu_cluster_num(const int num) {
593611
#ifdef LITE_WITH_XPU
594-
lite::TargetWrapperXPU::cluster_num = num;
612+
reinterpret_cast<lite::XPURunTimeOption *>(
613+
runtime_options()[TARGET(kXPU)].get())
614+
->xpu_cluster_num = num;
595615
#else
596616
LOG(WARNING) << "The invoking of the function "
597617
"'set_xpu_cluster_num' is ignored, please "
@@ -601,14 +621,40 @@ void CxxConfig::set_xpu_cluster_num(const int num) {
601621

602622
void CxxConfig::set_xpu_sdnn_num(const int num) {
603623
#ifdef LITE_WITH_XPU
604-
lite::TargetWrapperXPU::sdnn_num = num;
624+
reinterpret_cast<lite::XPURunTimeOption *>(
625+
runtime_options()[TARGET(kXPU)].get())
626+
->xpu_sdnn_num = num;
605627
#else
606628
LOG(WARNING) << "The invoking of the function "
607629
"'set_xpu_sdnn_num' is ignored, please "
608630
"rebuild it with LITE_WITH_XPU=ON.";
609631
#endif
610632
}
611633

634+
// Records the directory/file path where XPU tensor dumps should be written.
// Without XPU support compiled in, the call is ignored with a warning.
void CxxConfig::set_xpu_dump_tensor_path(const std::string dump_tensor_path) {
#ifdef LITE_WITH_XPU
  auto options = runtime_options();
  auto* xpu_option = reinterpret_cast<lite::XPURunTimeOption *>(
      options[TARGET(kXPU)].get());
  xpu_option->xpu_dump_tensor_path = dump_tensor_path;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_dump_tensor_path' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}
645+
646+
// Records the path where XPU runtime logs should be dumped.
// Without XPU support compiled in, the call is ignored with a warning.
void CxxConfig::set_xpu_dump_log_path(const std::string dump_log_path) {
#ifdef LITE_WITH_XPU
  auto options = runtime_options();
  auto* xpu_option = reinterpret_cast<lite::XPURunTimeOption *>(
      options[TARGET(kXPU)].get());
  xpu_option->xpu_dump_log_path = dump_log_path;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_dump_log_path' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}
657+
612658
template <class T>
613659
void CxxConfig::set_preferred_inputs_for_warmup(const int group_idx,
614660
const int tensor_idx,

lite/api/paddle_api.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ class LITE_API ConfigBase {
185185
bool metal_use_memory_reuse_{false};
186186

187187
std::vector<std::string> discarded_passes_{};
188+
std::map<TargetType, std::shared_ptr<void>> runtime_options_;
188189

189190
public:
190191
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
@@ -346,6 +347,9 @@ class LITE_API ConfigBase {
346347
const std::vector<std::string> get_discarded_passes() const {
347348
return discarded_passes_;
348349
}
350+
// Returns a COPY of the per-target runtime-option table. The map itself is
// copied on every call, but the shared_ptr values still alias the same
// underlying option objects, so callers that mutate through the returned
// pointers (e.g. the set_xpu_* setters) affect this config.
std::map<TargetType, std::shared_ptr<void>> runtime_options() const {
  return runtime_options_;
}
349353
};
350354

351355
class LITE_API CxxModelBuffer {
@@ -455,6 +459,8 @@ class LITE_API CxxConfig : public ConfigBase {
455459
void set_xpu_sdnn_num(const int num);
456460
void set_xpu_local_quant(bool local_quant = false);
457461
void set_xpu_compute_precision(const std::string& precision = "int16");
462+
void set_xpu_dump_tensor_path(const std::string dump_tensor_path = "");
463+
void set_xpu_dump_log_path(const std::string dump_log_path = "");
458464

459465
// set input tensor for warmup.
460466
// It is optional. If you set prefered_inputs, model wil run immediately when

0 commit comments

Comments
 (0)