@@ -293,8 +293,9 @@ OneDNNPhiKernelInstruction::OneDNNPhiKernelInstruction(
.AsVector();

for (auto& attr : data_format_tensors_attr) {
auto pair = kernel_context_.InputRangeAt(value_exec_info_->GetIdByName(
attr.dyn_cast<pir::StrAttribute>().AsString()));
auto pair =
kernel_context_.InputRangeAt(yaml_info_parser.InputName2Id().at(
attr.dyn_cast<pir::StrAttribute>().AsString()));
for (int i = pair.first; i < pair.second; ++i) {
data_format_tensors_.insert(i);
}
@@ -52,7 +52,7 @@ class OneDNNPhiKernelInstruction : public InstructionBase {

const std::string& Name() const override { return phi_op_name_; }

private:
protected:
paddle::dialect::InferMetaInterface::Concept* infer_meta_interface_{
nullptr}; // not owned

@@ -48,13 +48,73 @@ OneDNNMixedPhiKernelInstruction::OneDNNMixedPhiKernelInstruction(
const platform::Place& place,
pir::Operation* op,
const ValueExecutionInfo* value_exec_info)
: OneDNNPhiKernelInstruction(id, place, op, value_exec_info) {}
: OneDNNPhiKernelInstruction(id, place, op, value_exec_info) {
auto op_attributes = op->attributes();
kernel_name_ =
op_attributes.at("kernel_name").dyn_cast<pir::StrAttribute>().AsString();
kernel_key_ = op_attributes.at("kernel_key")
.dyn_cast<paddle::dialect::KernelAttribute>()
.data();
}

void OneDNNMixedPhiKernelInstruction::Run() {
// Step1. Mixed Dynamic Choose Kernel
// todo if (input_tensor.layout() != phi::DataLayout::ONEDNN)
if (!has_choose_kernel_) {
has_choose_kernel_ = true;
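// Ask the oneDNN kernel whether it can handle the current inputs; the
// per-kernel predicate is registered through
// Kernel::check_if_onednn_kernel_support_ (see kernel_factory.h and
// pad3d_kernel.cc below).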
use_onednn_kernel_ =
phi_kernel_->check_if_onednn_kernel_support_(&kernel_context_);
if (!use_onednn_kernel_) {
auto kernel_result =
phi::KernelFactory::Instance().SelectKernelOrThrowError(kernel_name_,
kernel_key_);
delete phi_kernel_;
phi_kernel_ = new phi::Kernel(kernel_result.kernel);
}
}

// Step2. Run Kernel
if (use_onednn_kernel_) {
OneDNNPhiKernelInstruction::Run();
} else {
// TransLayout first
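// Inputs produced by upstream oneDNN kernels may still carry the ONEDNN
// layout; convert them back to a Paddle layout before running the plain
// CPU kernel.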
auto inputs = kernel_context_.InputsBetween<phi::DenseTensor>(
size_t(0), kernel_context_.InputsSize());

for (size_t i = 0; i < inputs.size(); ++i) {
auto input = inputs[i];
if (input->layout() == phi::DataLayout::ONEDNN) {
DataLayout tmp_layout =
phi::OneDNNContext::tls().get_cur_paddle_data_layout();

// NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in
// data_transfer.cc
if (!input->IsInitialized() && tmp_layout == DataLayout::NHWC) {
auto transed_tensor = const_cast<phi::DenseTensor*>(input);
transed_tensor->set_layout(tmp_layout);
phi::funcs::MatchShapeToLayout(
transed_tensor, phi::DataLayout::ONEDNN, tmp_layout);
} else {
phi::DenseTensor transed_tensor;
transed_tensor.set_meta(input->meta());
phi::funcs::TransDataLayoutFromOneDNN(phi::DataLayout::ONEDNN,
tmp_layout,
*input,
&transed_tensor,
phi::CPUPlace());
*(const_cast<phi::DenseTensor*>(input)) = transed_tensor;
}
}
}

OneDNNPhiKernelInstruction::Run();
VLOG(6) << "Begin run op " << phi_op_name_ << " infer meta.";
if (infer_meta_interface_) {
infer_meta_interface_->infer_meta_(&(infer_meta_context_));
}
VLOG(6) << "End run op " << phi_op_name_ << " infer meta.";
VLOG(6) << "Begin run op " << phi_op_name_ << " kernel.";
(*(phi_kernel_))(&(kernel_context_));
VLOG(6) << "End run op " << phi_op_name_ << " kernel.";
}
}

} // namespace framework
@@ -33,6 +33,12 @@ class OneDNNMixedPhiKernelInstruction : public OneDNNPhiKernelInstruction {
const ValueExecutionInfo* value_exec_info);

void Run() override;

private:
std::string kernel_name_;
phi::KernelKey kernel_key_;
bool has_choose_kernel_{false};
bool use_onednn_kernel_{true};
};

} // namespace framework
26 changes: 22 additions & 4 deletions paddle/fluid/framework/new_executor/pir_interpreter.cc
@@ -588,6 +588,21 @@ void PirInterpreter::UpdateNcclOpNum() {
VLOG(4) << "Update nccl op num, nccl op num is: " << nccl_op_num;
}

void PirInterpreter::UpdateOneDNNOpNum() {
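// Count instructions that run (or may run) oneDNN kernels; a nonzero count
// forces the interpreter into trace-run mode in Run() below.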
int64_t onednn_op_num = 0;
#ifdef PADDLE_WITH_DNNL
for (auto& ins : vec_instruction_base_) {
if (dynamic_cast<OneDNNPhiKernelInstruction*>(ins.get()) != nullptr ||
dynamic_cast<OneDNNLegacyKernelInstruction*>(ins.get()) != nullptr ||
dynamic_cast<OneDNNMixedPhiKernelInstruction*>(ins.get()) != nullptr) {
onednn_op_num = onednn_op_num + 1;
}
}
#endif
onednn_op_num_ = onednn_op_num;
VLOG(4) << "Update onednn op num, onednn op num is: " << onednn_op_num;
}

// Note(zhangbo):
// When there is a KQueueSync type OP in the model, breadth traversal is better
// than depth traversal. For example: OP(O) ->(direct_run)-> OP(A)
@@ -1305,7 +1320,7 @@ paddle::framework::FetchList PirInterpreter::Run(

// Run
if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 ||
execution_config_.used_for_inference ||
onednn_op_num_ || execution_config_.used_for_inference ||
((execution_config_.used_for_jit || execution_config_.used_for_cinn) &&
(sync_op_num_ == 0))) {
LOG_FIRST_N(INFO, 1) << "pir interpreter is running by trace mode ...";
@@ -1326,7 +1341,7 @@
}
#endif
if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 ||
execution_config_.used_for_inference ||
onednn_op_num_ || execution_config_.used_for_inference ||
((execution_config_.used_for_jit || execution_config_.used_for_cinn) &&
(sync_op_num_ == 0))) {
TraceRunImpl();
@@ -1395,7 +1410,7 @@ FetchList PirInterpreter::Run(const std::vector<std::string>& feed_names,

// Run
if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 ||
execution_config_.used_for_inference ||
onednn_op_num_ || execution_config_.used_for_inference ||
((execution_config_.used_for_jit || execution_config_.used_for_cinn) &&
(sync_op_num_ == 0))) {
LOG_FIRST_N(INFO, 1) << "pir interpreter is running by trace mode ...";
@@ -1416,7 +1431,7 @@
}
#endif
if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 ||
execution_config_.used_for_inference ||
onednn_op_num_ || execution_config_.used_for_inference ||
((execution_config_.used_for_jit || execution_config_.used_for_cinn) &&
(sync_op_num_ == 0))) {
TraceRunImpl();
@@ -1804,6 +1819,9 @@ void PirInterpreter::PreAnalysis() {

UpdateNcclOpNum();
VLOG(4) << "Done UpdateNcclOpNum";

UpdateOneDNNOpNum();
VLOG(4) << "Done UpdateOneDNNOpNum";
}

::pir::Value PirInterpreter::GetValueByName(const std::string& var_name) {
2 changes: 2 additions & 0 deletions paddle/fluid/framework/new_executor/pir_interpreter.h
@@ -124,6 +124,7 @@ class PirInterpreter : public InterpreterBaseImpl {
// build graph
void UpdateSyncOpNum();
void UpdateNcclOpNum();
void UpdateOneDNNOpNum();
void AnalyseExecuteOrderForTrace(
std::map<size_t, std::set<size_t>> op_downstream_map,
InstructionSchedulingPriorityLess compare);
Expand Down Expand Up @@ -196,6 +197,7 @@ class PirInterpreter : public InterpreterBaseImpl {
// used for Trace
int64_t sync_op_num_{-1};
int64_t nccl_op_num_{-1};
int64_t onednn_op_num_{-1};
std::vector<size_t> trace_execute_order_;

std::vector<HookFunc> output_hookfuncs_;
18 changes: 18 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -833,6 +833,24 @@ bool AnalysisPredictor::PrepareExecutor() {
gpu_pm.EnableIRPrinting();
}
gpu_pm.Run(pir_program_.get());
} else {
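// CPU path: apply constant folding, dead code elimination, and
// fetch -> shadow_output replacement to the PIR program.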
::pir::PassManager cpu_pm(::pir::IrContext::Instance(), 2);

auto constant_folding_pass = ::pir::CreateConstantFoldingPass();
constant_folding_pass->SetNotOwned(pir::kPlaceAttr, &place_);
constant_folding_pass->SetNotOwned(pir::kParamScopeAttr, sub_scope_);

cpu_pm.AddPass(std::move(constant_folding_pass));
cpu_pm.AddPass(::pir::CreateDeadCodeEliminationPass());
cpu_pm.AddPass(::pir::CreateReplaceFetchWithShadowOutputPass());
//----------------------------------------------------------------------------------------------//
if (!config_.glog_info_disabled()) {
cpu_pm.EnablePrintStatistics();
}
if (config_.ir_debug_) {
cpu_pm.EnableIRPrinting();
}
cpu_pm.Run(pir_program_.get());
}

pir_program_ = std::move(
12 changes: 5 additions & 7 deletions paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml
@@ -15,19 +15,17 @@
extra_args : bool is_test=false
data_format_tensors : x, out, mid_out, out_grad

- op : pad3d
extra_args :
Contributor: Is this key still required when there are no extra_args?

Contributor Author: The underlying mechanism does allow omitting it, but when writing this I thought it over and decided to keep an empty one: most ops have extra_args, so an explicit empty extra_args makes it obvious at a glance that this op's extra_args is empty.

data_format_tensors : x
dynamic_fallback : True

# - op : matmul
# extra_args : str mkldnn_data_type="float32"
# layout_transform :
# arg_name: cur_paddle_data_layout
# tensors: x, y

# - op : pad3d
# extra_args :
# layout_transform :
# arg_name: data_format
# tensors: x
# dynamic_fallback : True

# - op : batch_norm
# extra_args : bool fuse_with_relu=false
# layout_transform :
28 changes: 27 additions & 1 deletion paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
@@ -1345,7 +1345,7 @@ void HandleForSpecialOp(
}
}

if (op_item->isa<::pir::YieldOp>() || op_item->isa<::pir::ShadowOutputOp>()) {
if (op_item->isa<::pir::YieldOp>()) {
if (op_item->num_operands() > 0) {
for (size_t i = 0; i < op_item->num_operands(); ++i) {
auto cur_in = op_item->operand_source(i);
@@ -1360,6 +1360,32 @@
}
}

if (op_item->isa<::pir::ShadowOutputOp>()) {
if (op_item->num_operands() > 0) {
for (size_t i = 0; i < op_item->num_operands(); ++i) {
auto cur_in = op_item->operand_source(i);
if (!cur_in) {
vec_inputs.emplace_back();
continue;
}
auto new_in = GetNewInput(
cur_in, *map_value_pair, static_cast<int>(i), op_item->name());
// layout transfer(only for onednn)
#ifdef PADDLE_WITH_DNNL
auto new_in_type = new_in.type();
if (new_in_type.isa<AllocatedDenseTensorType>()) {
if (new_in_type.dyn_cast<AllocatedDenseTensorType>().data_layout() ==
phi::DataLayout::ONEDNN) {
new_in = AddOneDNN2PaddleLayoutTransferOp(
new_in, phi::DataLayout::ANY, block);
}
}
#endif
Comment on lines +1374 to +1383

Contributor: This doesn't look quite right. People may build with WITH_MKLDNN but not run in mkldnn mode; won't this affect the logic of the other modes?

Contributor: Misread it. With the check if (new_in_type.dyn_cast<AllocatedDenseTensorType>().data_layout() == phi::DataLayout::ONEDNN) it won't.

Contributor Author: When not running in mkldnn mode, new_in_type.dyn_cast<AllocatedDenseTensorType>().data_layout() == phi::DataLayout::ONEDNN is false, so the other modes are unaffected. But given the current structure of pd_op_to_kernel_pass.cc, this is the only place the layout transfer can go.

vec_inputs.push_back(new_in);
}
}
}

if (op_item->isa<::pir::SetParameterOp>()) {
if (op_item->num_operands() > 0) {
for (size_t i = 0; i < op_item->num_operands(); ++i) {
2 changes: 2 additions & 0 deletions paddle/phi/core/kernel_factory.h
@@ -282,6 +282,8 @@ class Kernel {
}

GetKernelTypeForVarFn get_kerneltype_forvar_fn_{nullptr};
std::function<bool(const KernelContext* ctx)> check_if_onednn_kernel_support_{
nullptr};

private:
KernelFn fn_{nullptr};
5 changes: 2 additions & 3 deletions paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc
@@ -62,14 +62,13 @@ void OneDNN2PaddleLayout(const Context& dev_ctx,
}

DataLayout tmp_layout = static_cast<DataLayout>(dst_layout);
if (static_cast<DataLayout>(dst_layout) == DataLayout::ANY) {
tmp_layout = phi::OneDNNContext::tls().get_cur_paddle_data_layout();
}

if (tmp_layout == DataLayout::ANY) {
tmp_layout = phi::OneDNNContext::tls().get_cur_paddle_data_layout();
}

VLOG(4) << "src_layout: " << src_layout << ", tmp_layout: " << tmp_layout;

// NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in
// data_transfer.cc
if (!x.IsInitialized() && src_layout == DataLayout::ONEDNN &&
10 changes: 10 additions & 0 deletions paddle/phi/kernels/onednn/pad3d_kernel.cc
@@ -38,6 +38,15 @@ KernelKey Pad3dGetKernelTypeForVar(const GetKernelTypeForVarContext* ctx) {
tensor.place(), tensor.layout(), expected_kernel_type.dtype());
}

bool Pad3dCheckIfOneDNNSupport(const KernelContext* ctx) {
// only constant mode and non-blocked layouts are supported for oneDNN
if (ctx->AttrAt<std::string>(1) == "constant" &&
ctx->InputAt<phi::DenseTensor>(0).mem_desc().get_inner_nblks() == 0) {
return true;
}
return false;
}

template <typename T, typename Context>
void Pad3dKernel(const Context& dev_ctx,
const DenseTensor& x,
Expand All @@ -58,4 +67,5 @@ PD_REGISTER_KERNEL(pad3d,
phi::dtype::bfloat16,
float) {
kernel->get_kerneltype_forvar_fn_ = phi::Pad3dGetKernelTypeForVar;
kernel->check_if_onednn_kernel_support_ = phi::Pad3dCheckIfOneDNNSupport;
}
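The pad3d entry with dynamic_fallback : True in ops_onednn_extra.yaml and the predicate registered here work together: OneDNNMixedPhiKernelInstruction::Run() calls the predicate once at runtime and falls back to the plain CPU kernel when it returns false. A minimal sketch of how another oneDNN kernel could opt in to the same mechanism, assuming the same headers as pad3d_kernel.cc above; the SoftmaxCheckIfOneDNNSupport name and its condition are hypothetical, not part of this change:

namespace phi {

// Hypothetical runtime-support check for another oneDNN kernel. It rejects
// blocked oneDNN memory formats, mirroring the pad3d check above; a real
// kernel would encode whatever conditions it actually supports.
bool SoftmaxCheckIfOneDNNSupport(const KernelContext* ctx) {
  return ctx->InputAt<phi::DenseTensor>(0).mem_desc().get_inner_nblks() == 0;
}

}  // namespace phi

// In the kernel's PD_REGISTER_KERNEL block, assign it the same way pad3d does:
//   kernel->check_if_onednn_kernel_support_ = phi::SoftmaxCheckIfOneDNNSupport;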
20 changes: 20 additions & 0 deletions test/ir/inference/auto_scan_test.py
@@ -342,6 +342,26 @@ def inference_config_str(self, config) -> str:
return str(dic)


class PirMkldnnAutoScanTest(MkldnnAutoScanTest):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def run_test_config(
self, model, params, prog_config, pred_config, feed_data
) -> Dict[str, np.ndarray]:
"""
Test a single case.
"""
paddle.set_flags({'FLAGS_enable_pir_in_executor': True})
pred_config.switch_ir_optim(False)
pred_config.enable_new_executor()
result = super().run_test_config(
model, params, prog_config, pred_config, feed_data
)
paddle.set_flags({'FLAGS_enable_pir_in_executor': False})
return result


class PassAutoScanTest(AutoScanTest):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
1 change: 1 addition & 0 deletions test/ir/inference/program_config.py
@@ -344,6 +344,7 @@ def _cast(self) -> None:

def create_fake_model(program_config):
'''Create a Paddle model(in memory) according to the given config.'''
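# Build the in-memory model with the PIR executor disabled; tests that need the
# PIR path (e.g. PirMkldnnAutoScanTest above) re-enable the flag for the actual
# inference run.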
paddle.set_flags({'FLAGS_enable_pir_in_executor': False})
program_config = copy.deepcopy(program_config)
program_config._cast()
paddle.enable_static()