From eb0d3265695873b1e675b457205cc7ffe27d55c1 Mon Sep 17 00:00:00 2001 From: Wang Huan Date: Tue, 6 Feb 2024 06:45:32 +0000 Subject: [PATCH 1/9] pir onednn support fusion_gru,layer_norm,matmul --- .../dialect/operator/ir/ops_onednn_extra.yaml | 18 ++++++++++-------- paddle/phi/api/yaml/op_compat.yaml | 2 ++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml index 290b20357824eb..606fdecbba7b99 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml @@ -109,7 +109,8 @@ # - op : fused_transpose -# - op : fusion_gru +- op : fusion_gru + extra_args : str mkldnn_data_type="float32", float scale_data=1.0, float shift_data=0.0, float[] scale_weights={1.0f} # - op : fusion_lstm @@ -125,7 +126,8 @@ - op : hardswish_grad -# - op : layer_norm +- op : layer_norm + extra_args : str mkldnn_data_type="float32", bool is_test=false - op : leaky_relu @@ -141,13 +143,13 @@ extra_args : bool is_test=false data_format_tensors : x, out, mid_out, out_grad -# - op : matmul -# extra_args : str mkldnn_data_type="float32" -# layout_transform : -# arg_name: cur_paddle_data_layout -# tensors: x, y +- op : matmul + extra_args : str mkldnn_data_type="float32" + layout_transform : + arg_name: cur_paddle_data_layout + tensors: x, y -# - op : matmul_grad +- op : matmul_grad # - op : matmul_with_flatten diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index cd296f7c302b93..72739fabe1a253 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1429,6 +1429,8 @@ batched_input : BatchedInput batched_out : BatchedOut hidden : Hidden + attrs : + {scale_data : Scale_data, shift_data : Shift_data, scale_weights : Scale_weights} extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", float Scale_data = 1.0f, float Shift_data = 0.0f, 'float[] Scale_weights = {1.0f}'] From 3b52a23ec952a16fef2f58dab2f3a50a4d53ca8f Mon Sep 17 00:00:00 2001 From: Wang Huan Date: Tue, 6 Feb 2024 08:08:27 +0000 Subject: [PATCH 2/9] refine --- .../operators/mkldnn/layer_norm_mkldnn_op.cc | 14 +- .../dialect/operator/ir/ops_onednn_extra.yaml | 6 +- .../fluid/pir/dialect/operator/utils/utils.cc | 1 + test/legacy_test/op_compat.yaml | 3758 +++++++++++++++++ test/legacy_test/test_fusion_gru_op.py | 4 +- test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py | 4 +- test/mkldnn/test_fusion_gru_int8_mkldnn_op.py | 6 +- test/mkldnn/test_fusion_gru_mkldnn_op.py | 9 + test/mkldnn/test_layer_norm_bf16_mkldnn_op.py | 2 + test/mkldnn/test_layer_norm_mkldnn_op.py | 5 + test/mkldnn/test_matmul_v2_mkldnn_op.py | 12 +- 11 files changed, 3806 insertions(+), 15 deletions(-) create mode 100755 test/legacy_test/op_compat.yaml diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index d2b715a5f56e6a..1f700c0630b1d6 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -71,7 +71,7 @@ class LayerNormOneDNNHandler } }; -template +template class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -143,8 +143,10 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; 
-REGISTER_OP_KERNEL(layer_norm, - MKLDNN, - ::phi::CPUPlace, - ops::LayerNormMKLDNNOpKernel, - ops::LayerNormMKLDNNOpKernel); + +PD_REGISTER_STRUCT_KERNEL(layer_norm, + OneDNN, + ONEDNN, + ops::LayerNormMKLDNNOpKernel, + float, + paddle::platform::bfloat16) {} diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml index 606fdecbba7b99..889d79bd6987c7 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml @@ -145,11 +145,11 @@ - op : matmul extra_args : str mkldnn_data_type="float32" - layout_transform : - arg_name: cur_paddle_data_layout - tensors: x, y + data_format_tensors : x, y - op : matmul_grad + extra_args : str mkldnn_data_type="float32" + data_format_tensors : x, y, out_grad # - op : matmul_with_flatten diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc index 332f1ba2248a4d..49a02f13e72d31 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -79,6 +79,7 @@ const std::unordered_set LegacyOpList = { paddle::onednn::dialect::LrnGradOp::name(), paddle::onednn::dialect::QuantizeOp::name(), paddle::onednn::dialect::RequantizeOp::name(), + paddle::onednn::dialect::LayerNormOp::name(), #endif CReduceMinOp::name(), PushSparseV2Op::name()}; diff --git a/test/legacy_test/op_compat.yaml b/test/legacy_test/op_compat.yaml new file mode 100755 index 00000000000000..026b8ca617e593 --- /dev/null +++ b/test/legacy_test/op_compat.yaml @@ -0,0 +1,3758 @@ +# All the configuration in this file are only for existing operators, +# which cannot be modified in principle. There's no need to configure +# this file for new operator. +# +# This file is used for two purposes: +# 1. Configure the mapping relationship of parameter names of operator +# between the operators in ops.yaml and the old operators defined +# in fluid. +# 2. Save the extra parameters in the OpMaker of operators temporarily, +# which will be removed in the future. 
+ +# - op : rnn +# backward : rnn_grad +# extra : +# attrs : [bool is_test = false] + +- op : abs + backward : abs_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false] + +- op : accuracy + inputs : + {x : Out , indices : Indices, label: Label} + outputs : + {accuracy : Accuracy, correct : Correct, total : Total} + +- op : acos + inputs : + x : X + outputs : + out : Out + +- op : acosh + inputs : + x : X + outputs : + out : Out + backward : acosh_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : adadelta_ (adadelta) + inputs : + {param : Param, grad: Grad, avg_squared_grad : AvgSquaredGrad, avg_squared_update : AvgSquaredUpdate, learning_rate : LearningRate, master_param : MasterParam } + outputs : + {param_out : ParamOut, moment_out : AvgSquaredGradOut, inf_norm_out : AvgSquaredUpdateOut, master_param_out : MasterParamOut} + +- op : adagrad_ (adagrad) + inputs : + { param : Param, grad : Grad, moment : Moment, learning_rate : LearningRate, master_param : MasterParam } + outputs : + { param_out : ParamOut, moment_out : MomentOut, master_param_out : MasterParamOut } + +- op : adam_ (adam) + inputs : + {param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam, skip_update: SkipUpdate} + outputs : + {param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut} + scalar : + beta1 : + data_type : float + tensor_name : Beta1Tensor + beta2 : + data_type : float + tensor_name : Beta2Tensor + epsilon : + data_type : float + tensor_name : EpsilonTensor + manual_signature : [adam_] + +- op : adamax_ (adamax) + inputs : + {param : Param, grad: Grad, learning_rate : LearningRate, moment : Moment, inf_norm : InfNorm, beta1_pow : Beta1Pow, master_param : MasterParam} + outputs : + {param_out : ParamOut, moment_out : MomentOut, inf_norm_out : InfNormOut, master_param_out : MasterParamOut} + +- op : adamw_ (adamw) + inputs : + {param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam, skip_update: SkipUpdate} + outputs : + {param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut} + scalar : + beta1 : + data_type : float + tensor_name : Beta1Tensor + beta2 : + data_type : float + tensor_name : Beta2Tensor + epsilon : + data_type : float + tensor_name : EpsilonTensor + +- op : add (elementwise_add) + backward : add_grad (elementwise_add_grad), add_double_grad (elementwise_add_grad_grad), add_triple_grad (elementwise_add_triple_grad) + inputs : + {x : X, y : Y} + outputs : + {out : Out} + attrs : + {scale_x : Scale_x, scale_y : Scale_y, scale_out : Scale_out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + +- op : add_n (sum) + inputs: + {inputs : X} + outputs: + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : addmm + backward : addmm_grad + inputs : + {input : Input, x : X, y : Y} + outputs : + out : Out + attrs : + {alpha : Alpha, beta : Beta} + extra : + attrs : [bool use_mkldnn = false] + +- op : affine_grid + backward : 
affine_grid_grad + inputs : + input : Theta + outputs : + output : Output + int_array: + output_shape : + data_type : int + tensor_name : OutputShape + extra : + attrs : [bool use_cudnn = true] + +- op : all (reduce_all) + inputs: + x : X + attrs: + { axis : dim, keepdim : keep_dim} + outputs: + out : Out + manual_signature : [all] + extra : + attrs : [bool use_mkldnn = false] + +- op : allclose + inputs : + {x : Input, y : Other} + outputs : + out : Out + scalar : + rtol : + data_type : std::string + tensor_name : Rtol + atol : + data_type : std::string + tensor_name : Atol + +- op : amax (reduce_amax) + backward : amax_grad (reduce_amax_grad) + inputs : + x : X + outputs : + out : Out + attrs: + { axis : dim, keepdim : keep_dim } + extra : + attrs : [bool use_mkldnn = false] + get_expected_kernel_type : + amax_grad : GetReduceGradExpectedKernelType + manual_signature : [amax] + +- op : amin (reduce_amin) + backward : amin_grad (reduce_amin_grad) + inputs : + x : X + outputs : + out : Out + attrs: + { axis : dim, keepdim : keep_dim } + extra : + attrs : [bool use_mkldnn = false] + get_expected_kernel_type : + amin_grad : GetReduceGradExpectedKernelType + manual_signature : [amin] + +- op : angle + backward : angle_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false] + +- op : any (reduce_any) + inputs : + x : X + outputs : + out : Out + attrs: + { axis : dim, keepdim : keep_dim } + extra : + attrs : [bool use_mkldnn = false] + get_expected_kernel_type : + any : GetReduceOpUseInputPlaceExpectedKernelType + manual_signature : [any] + +- op : arange(range) + inputs : + {start : Start, end : End, step : Step} + outputs : + out : Out + +- op : argmax(arg_max) + inputs : + x : X + outputs : + out : Out + scalar: + axis: + data_type : int64_t + support_tensor : true + +- op : argmin(arg_min) + inputs : + x : X + outputs : + out : Out + scalar: + axis: + data_type : int64_t + support_tensor : true + +- op : argsort + inputs : + x : X + outputs : + out : Out + indices : Indices + +- op : array_to_tensor(tensor_array_to_tensor) + backward : tanh_shrink_grad + inputs : + x : X + outputs : + out : Out + out_index : OutIndex + +- op : as_complex + inputs : + x : X + outputs : + out : Out + +- op : as_real + inputs : + x : X + outputs : + out : Out + +- op : asin + inputs : + x : X + outputs : + out : Out + +- op : asinh + backward : asinh_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : assert + inputs : + {cond : Cond, data : Data} + +- op : assign + backward : assign_grad + inputs : + x : X + outputs : + out : Out + manual_signature : [assign, assign_grad] + get_expected_kernel_type : + assign : GetAssignExpectedKernelType + +- op : assign_value + outputs : + out : Out + manual_signature : [assign_value] + +- op : atan + inputs : + x : X + outputs : + out : Out + +- op : atan2 + backward : atan2_grad + inputs : + {x : X1, y : X2} + outputs : + out : Out + +- op : atanh + backward : atanh_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : auc + inputs : + {x : Predict, label : Label, stat_pos : StatPos, stat_neg : StatNeg, ins_tag_weight : InsTagWeight} + outputs : + {auc : AUC, stat_pos_out : StatPosOut, stat_neg_out : StatNegOut} + +- op : batch_norm + backward : batch_norm_grad, batch_norm_double_grad(batch_norm_grad_grad) + inputs: + x : X + mean : Mean + variance : Variance + scale : Scale + bias : 
Bias + outputs : + out : Y + mean_out: MeanOut + variance_out: VarianceOut + saved_mean: SavedMean + saved_variance: SavedVariance + reserve_space: ReserveSpace + attrs: + data_format: data_layout + extra : + attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + +- op : bce_loss + backward : bce_loss_grad + inputs : + {input : X, label : Label} + outputs : + out : Out + +- op : bernoulli + inputs : + x : X + outputs : + out : Out + +- op : bicubic_interp (bicubic_interp_v2) + backward : bicubic_interp_grad (bicubic_interp_v2_grad) + inputs : + {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} + outputs : + output : Out + attrs: + data_format: data_layout + extra : + attrs : [bool use_mkldnn = false] + +- op : bilinear (bilinear_tensor_product) + backward: bilinear_grad (bilinear_tensor_product_grad) + inputs : + {x : X, y : Y,weight: Weight, bias: Bias} + outputs : + {out : Out} + +- op : bilinear_interp (bilinear_interp_v2) + backward : bilinear_interp_grad (bilinear_interp_v2_grad) + inputs : + {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} + outputs : + output : Out + attrs: + data_format: data_layout + extra : + attrs : [bool use_mkldnn = false] + +- op : bincount + inputs : + {x : X, weights : Weights} + outputs : + out : Out + scalar: + minlength: + data_type : int + support_tensor : true + get_expected_kernel_type : + bincount : GetBincountExpectedKernelType + +- op : bitwise_and + inputs : + {x : X, y : Y} + outputs : + {out : Out} + +- op : bitwise_not + inputs : + {x : X} + outputs : + {out : Out} + +- op : bitwise_or + inputs : + {x : X, y : Y} + outputs : + {out : Out} + +- op : bitwise_xor + inputs : + {x : X, y : Y} + outputs : + {out : Out} + +- op : bmm + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : bn_act_xpu + attrs: + data_format: data_layout + +- op : box_coder + inputs : + {prior_box : PriorBox , prior_box_var : PriorBoxVar, target_box: TargetBox} + outputs : + output_box : OutputBox + +- op : broadcast_tensors + backward : broadcast_tensors_grad + inputs : + input : X + outputs : + out : Out + drop_empty_grad : [input_grad] + +- op : c_concat + inputs : + x : X + outputs : + out : Out + +- op : c_embedding + backward : c_embedding_grad + inputs : + {weight : W, x : Ids} + outputs : + out : Out + +- op : c_softmax_with_cross_entropy + backward : c_softmax_with_cross_entropy_grad + inputs : + {logits : Logits, label : Label} + outputs : + {softmax : Softmax, loss : Loss} + +- op : cast + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false] + +- op : ceil + backward : ceil_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : celu + backward : celu_grad, celu_double_grad(celu_grad_grad) + inputs : + x : X + outputs : + out : Out + +- op : check_finite_and_unscale_(check_finite_and_unscale) + inputs : + {x : X, scale: Scale} + outputs : + {out : Out, found_infinite: FoundInfinite} + get_expected_kernel_type : + check_finite_and_unscale_ : GetCheckFiniteAndUnscaleExpectedKernelType + +- op : cholesky + inputs : + x : X + outputs : + out : Out + +- op : cholesky_solve + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : class_center_sample + inputs : + label : Label + outputs : + {remapped_label : RemappedLabel, sampled_local_class_center : SampledLocalClassCenter} + +- op : clip + backward : clip_grad, clip_double_grad + inputs : + x : X + outputs : + out : Out + scalar : + min 
: + data_type : float + tensor_name : Min + max : + data_type : float + tensor_name : Max + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : clip_by_norm + inputs : + x : X + outputs : + out : Out + +- op : coalesce_tensor + inputs : + {input : Input} + outputs : + {output : Output, fused_output : FusedOutput} + attrs : + {size_of_dtype : user_defined_size_of_dtype} + +- op : complex + backward : complex_grad + inputs : + {real : X, imag : Y} + outputs : + out : Out + +- op : concat + backward : concat_grad, concat_double_grad + inputs: + x: X + outputs: + out: Out + attrs: + axis: axis + scalar : + axis : + data_type : int + tensor_name : AxisTensor + drop_empty_grad : [x_grad] + extra : + attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] + get_expected_kernel_type : + concat : GetConcatExpectedKernelType + +- op : conditional_block + backward : conditional_block_grad + extra : + attrs : ['str[] skip_eager_deletion_vars = {}'] + +- op : conj + inputs : + x : X + outputs : + out : Out + +- op : conv2d + backward : conv2d_grad, conv2d_grad_grad + inputs : + {input : Input, filter : Filter} + outputs : + out : Output + extra : + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_addto = false, + bool force_fp32_output = false, + int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] + get_expected_kernel_type : + conv2d : GetConvExpectedKernelType + +- op : conv2d_transpose + backward : conv2d_transpose_grad, conv2d_transpose_double_grad (conv2d_transpose_grad_grad) + inputs : + {x : Input, filter : Filter, bias : Bias} + outputs : + out : Output + int_array : + output_size : + data_type : int + support_tensor : true + extra : + inputs : [bias] + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool force_fp32_output = false, + str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, + int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] + +- op : conv3d + backward : conv3d_grad, conv3d_double_grad (conv3d_grad_grad) + inputs : + {input : Input, filter : Filter} + outputs : + out : Output + extra : + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, + bool use_addto = false, bool fuse_residual_connection = false, bool force_fp32_output = false, + int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] + get_expected_kernel_type : + conv3d : GetConvExpectedKernelType + +- op : conv3d_transpose + backward : conv3d_transpose_grad + inputs : + {x : Input, filter : Filter} + outputs : + out : Output + extra : + attrs : [bool use_cudnn = true, bool use_mkldnn = false, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] + +- op : cos + backward : cos_grad, cos_double_grad, cos_triple_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : cosh + backward : cosh_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : crop (crop_tensor) + backward : crop_grad (crop_tensor_grad) + inputs : + x : X + 
outputs : + out : Out + int_array: + shape : + data_type : int + tensor_name : Shape + tensors_name : ShapeTensor + offsets : + data_type : int + tensor_name : Offsets + tensors_name : OffsetsTensor + +- op : cross + inputs : + {x : X, y : Y} + attrs : + axis : dim + outputs : + out : Out + +- op : cross_entropy_with_softmax (softmax_with_cross_entropy) + backward : cross_entropy_with_softmax_grad (softmax_with_cross_entropy_grad) + inputs : + {input : Logits, label : Label} + outputs : + {softmax : Softmax, loss : Loss} + +- op : cumprod + backward : cumprod_grad + inputs : + x : X + attrs : + dim : dim + outputs : + out : Out + +- op : cumsum + backward: cumsum_grad + inputs : + x : X + outputs : + out : Out + scalar: + axis: + data_type : int + support_tensor : true + +- op : data_norm + backward : data_norm_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : decode_jpeg + inputs : + x : X + outputs : + out : Out + +- op : deformable_conv + backward : deformable_conv_grad + inputs : + {x : Input, offset : Offset, filter : Filter, mask : Mask} + outputs : + out : Output + +- op : depthwise_conv2d + backward : depthwise_conv2d_grad, depthwise_conv2d_double_grad (depthwise_conv2d_grad_grad) + inputs : + {input : Input, filter : Filter} + outputs : + out : Output + attrs : + {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} + extra : + attrs : [bool is_test = false, bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] + get_expected_kernel_type : + depthwise_conv2d : GetConvExpectedKernelType + +- op : depthwise_conv2d_transpose + backward : depthwise_conv2d_transpose_grad + inputs : + {x : Input, filter : Filter, bias: Bias} + outputs : + out : Output + int_array : + output_size : + data_type : int + support_tensor : true + extra : + inputs : [bias] + attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = false, bool force_fp32_output = false, + str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, + int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] + +- op : dequantize + inputs : + input : Input + outputs : + output : Output + attrs : + {scale : Scale, shift : Shift} + +- op : dequantize_linear + extra : + attrs : [float moving_rate = 0.9] + +- op : det (determinant) + backward : det_grad (determinant_grad) + inputs : + x : Input + outputs : + out : Out + +- op : diag (diag_v2) + backward : diag_grad (diag_v2_grad) + inputs : + x : X + outputs : + out : Out + +- op : diag_embed + inputs : + input : Input + outputs : + out : Out + +- op : diagonal + inputs : + x : Input + outputs : + out : Out + +- op : digamma + inputs : + x : X + outputs : + out : Out + +- op : dirichlet + inputs : + alpha : Alpha + outputs : + out : Out + +- op : dist + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : distributed_push_sparse + extra : + attrs : ['int[] slots = {}'] + +- 
op : divide (elementwise_div) + backward : divide_grad (elementwise_div_grad) + inputs : + {x: X, y : Y} + outputs : + out: Out + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : dot + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : dropout + backward : dropout_grad + inputs : + x : X + seed_tensor : Seed + outputs : + out : Out + mask : Mask + attrs : + p : dropout_prob + is_test : is_test + mode : dropout_implementation + seed : seed + fix_seed : fix_seed + extra : + attrs : [bool fix_seed = false, int seed = 0] + +- op : dropout_nd + backward : dropout_nd_grad + extra : + attrs : [bool fix_seed = false, int seed = 0] + +- op : edit_distance + inputs : + hyps : Hyps + refs : Refs + hypslength : HypsLength + refslength : RefsLength + outputs : + sequencenum : SequenceNum + out : Out + +- op : eig + inputs : + x : X + outputs : + out_w : Eigenvalues + out_v : Eigenvectors + +- op : eigh + inputs : + x : X + outputs : + out_w : Eigenvalues + out_v : Eigenvectors + +- op : eigvals + inputs : + x : X + outputs : + out : Out + +- op : eigvalsh + backward : eigvalsh_grad + inputs : + {x : X} + outputs : + {eigenvalues : Eigenvalues, eigenvectors : Eigenvectors} + attrs : + uplo : UPLO + +- op : einsum + backward : einsum_grad + inputs : + x : Operands + outputs: + {out : Out, inner_cache: InnerCache, xshape : XShape} + drop_empty_grad: [x_grad] + extra: + outputs: [inner_cache, xshape] + +- op : elementwise_pow + backward : elementwise_pow_grad + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + manual_signature : [elementwise_pow] + +- op : elu + backward : elu_grad, elu_double_grad (elu_grad_grad) + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false] + +- op : embedding (lookup_table_v2) + backward : embedding_grad (lookup_table_v2_grad) + inputs : + {x : Ids, weight : W} + outputs : + out : Out + attrs : + sparse : is_sparse + manual_signature : [embedding_grad] + extra : + attrs : [bool is_sparse = false, bool is_distributed = false, bool remote_prefetch = false, + int trainer_id = 0, int slot = 0, 'int64_t[] height_sections = {}', 'str[] epmap = {}', + 'str[] table_names = {}'] + +- op : empty + outputs : + out : Out + int_array: + shape : + data_type : int64_t + tensor_name : ShapeTensor + tensors_name : ShapeTensorList + +- op : equal + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : equal_all + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : erf + inputs : + x : X + outputs : + out : Out + +- op : erfinv + inputs : + x : X + outputs : + out : Out + +- op : exp + backward : exp_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : expand (expand_v2) + backward : expand_grad (expand_v2_grad), expand_double_grad(expand_v2_double_grad) + inputs : + x : X + attrs : + shape : shape + outputs : + out : Out + int_array: + shape : + data_type : int + tensor_name : Shape + tensors_name : expand_shapes_tensor + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + manual_signature : [expand, expand_grad] + +- op : expand_as (expand_as_v2) + backward : expand_as_grad (expand_as_v2_grad) + 
inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : expm1 + backward : expm1_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : exponential_ (exponential) + backward : exponential__grad (exponential_grad) + inputs : + x : X + outputs : + out : Out + attrs : + lam : lambda + +- op : eye + outputs : + out : Out + scalar : + num_rows : + support_tensor : true + num_columns : + support_tensor : true + +- op : fake_channel_wise_quantize_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_channel_wise_quantize_dequantize_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_dequantize_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_dequantize_moving_average_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_moving_average_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_range_abs_max + extra : + attrs : [int round_type = 1] + +- op : fc + inputs : + input : Input + w : W + bias : Bias + outputs : + out : Out + extra : + attrs : [bool ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE = true, bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", float Scale_in = 1.0f, "float[] Scale_weights = {1.0f}", float Scale_out = 1.0f, bool force_fp32_output = false] + +- op : feed + outputs: {out: Out} + +- op : fft_c2c + inputs: {x: X} + outputs: {out: Out} + +- op : fft_c2r + inputs: {x: X} + outputs: {out: Out} + +- op : fft_r2c + inputs: {x: X} + outputs: {out: Out} + +- op : fill (fill_any) + backward : fill_grad (fill_any_grad) + inputs : + x : X + outputs : + out : Out + scalar : + value : + data_type : float + support_tensor : true + +- op : fill_diagonal + backward : fill_diagonal_grad + inputs : + x : X + outputs : + out : Out + +- op : fill_diagonal_tensor + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : flatten (flatten_contiguous_range) + backward : flatten_grad (flatten_contiguous_range_grad) + inputs : + x : X + outputs : + {out : Out, xshape : XShape} + attrs : + {start_axis : start_axis, stop_axis : stop_axis} + extra : + outputs : [xshape] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + manual_signature : [flatten, flatten_grad] + +- op : flip + inputs : + x : X + outputs : + out : Out + +- op : floor + backward : floor_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : floor_divide (elementwise_floordiv) + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + manual_signature : [floor_divide] + +- op : fmax (elementwise_fmax) + backward : fmax_grad (elementwise_fmax_grad) + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + manual_signature : [fmax] + +- op : fmin (elementwise_fmin) + backward : fmin_grad (elementwise_fmin_grad) + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float 
Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + manual_signature : [fmin] + +- op : fold + inputs : + x : X + outputs : + out : Y + +- op : frame + backward : frame_grad + inputs : + x : X + outputs : + out : Out + +- op : frobenius_norm + backward : frobenius_norm_grad + inputs: + x : X + attrs: + { axis : dim, keepdim : keep_dim} + outputs: + out : Out + int_array: + axis : + data_type : int + support_tensor : true + get_expected_kernel_type : + frobenius_norm : GetReduceExpectedKernelType + frobenius_norm_grad : GetReduceGradExpectedKernelType + extra : + attrs : [bool use_mkldnn = false] + +- op : full (fill_constant) + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false] + +- op : full_like (fill_any_like) + inputs : + x : X + outputs : + out : Out + scalar : + value : + data_type : float + support_tensor : true + +- op : fused_attention + backward: fused_attention_grad + inputs: + x: X + ln_scale: LnScale + ln_bias: LnBias + qkv_weight: QKVW + qkv_bias: QKVBias + cache_kv: CacheKV + src_mask: SrcMask + out_linear_weight: OutLinearW + out_linear_bias: OutLinearBias + ln_scale_2: Ln2Scale + ln_bias_2: Ln2Bias + outputs: + ln_mean: LnMean + ln_var: LnVariance + ln_out: LnOut + qkv_out: QKVOut + qkv_bias_out: QKVBiasOut + transpose_out_2: TransposeOut2 + qk_out: QKOut + qktv_out: QKTVOut + softmax_out: SoftmaxOut + attn_dropout_mask_out: AttnDropoutMaskOut + attn_dropout_out: AttnDropoutOut + src_mask_out: SrcMaskOut + fmha_out: FMHAOut + out_linear_out: OutLinearOut + dropout_mask_out: DropoutMaskOut + ln_mean_2: Ln2Mean + ln_var_2: Ln2Variance + bias_dropout_residual_out: BiasDropoutResidualOut + cache_kv_out: CacheKVOut + out: Y + +- op : fused_batch_norm_act + backward : fused_batch_norm_act_grad + inputs: + x : X + mean : Mean + variance : Variance + scale : Scale + bias : Bias + outputs : + out : Y + mean_out: MeanOut + variance_out: VarianceOut + saved_mean: SavedMean + saved_variance: SavedVariance + reserve_space: ReserveSpace + +- op : fused_bias_dropout_residual_layer_norm + backward : fused_bias_dropout_residual_layer_norm_grad + inputs : + x : X + residual : Residual + bias : Bias + ln_scale : LnScale + ln_bias : LnBias + outputs : + bias_dropout_residual_out : BiasDropoutResidualOut + dropout_mask_out : DropoutMaskOut + ln_mean : LnMean + ln_variance : LnVariance + y : Y + +- op : fused_bn_add_activation_ (fused_bn_add_activation) + backward : fused_bn_add_activation_grad + inputs: + x : X + z : Z + mean : Mean + variance : Variance + scale : Scale + bias : Bias + outputs : + out : Y + mean_out: MeanOut + variance_out: VarianceOut + saved_mean: SavedMean + saved_variance: SavedVariance + reserve_space: ReserveSpace + +- op : fused_conv2d + inputs : + {input : Input, filter : Filter, bias : Bias, residual_param : ResidualData} + outputs : + {output : Output} + attrs : + {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} + extra : + attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, + float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] + +- op : fused_conv2d_add_act + inputs : + input : Input + filter : Filter + bias : Bias + residual_data : ResidualData + outputs : + output : Output + outputs : Outputs + extra : + attrs : [bool is_test = false, bool use_cudnn = true, bool fuse_relu_before_depthwise_conv = 
false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_beta = 0.0f, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false] + get_expected_kernel_type : + fused_conv2d_add_act : GetConvExpectedKernelType + +- op : fused_conv3d + inputs : + {input : Input, filter : Filter, bias : Bias, residual_param : ResidualData} + outputs : + {output : Output} + attrs : + {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} + extra : + attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, + float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] + +- op : fused_embedding_eltwise_layernorm + inputs : + ids : Ids + embs : Embs + bias : Bias + scale : Scale + outputs : + out : Out + +- op : fused_fc_elementwise_layernorm + inputs : + x : X + w : W + y : Y + bias0 : Bias0 + scale : Scale + bias1 : Bias1 + outputs : + out : Out + mean : Mean + variance : Variance + +- op : fused_feedforward + backward: fused_feedforward_grad + inputs: + x: X + dropout1_seed: Dropout1Seed + dropout2_seed: Dropout2Seed + linear1_weight: Linear1Weight + linear1_bias: Linear1Bias + linear2_weight: Linear2Weight + linear2_bias: Linear2Bias + ln1_scale: Ln1Scale + ln1_bias: Ln1Bias + ln2_scale: Ln2Scale + ln2_bias: Ln2Bias + attrs: + dropout1_seed_val: dropout1_seed + dropout2_seed_val: dropout2_seed + dropout1_prob: dropout1_rate + dropout2_prob: dropout2_rate + outputs: + out: Out + dropout1_mask: Dropout1Mask + dropout2_mask: Dropout2Mask + ln1_mean: Ln1Mean + ln1_variance: Ln1Variance + ln2_mean: Ln2Mean + ln2_variance: Ln2Variance + linear1_out: Linear1Out + ln1_out: Ln1Out + dropout1_out: Dropout1Out + dropout2_out: Dropout2Out + +- op : fused_gemm_epilogue + inputs: + {x : X, y : Y, bias : Bias} + outputs : + {out : Out, reserve_space: ReserveSpace} + +- op : fused_gemm_epilogue_grad + inputs: + {x : X, y : Y, reserve_space: ReserveSpace, out_grad : DOut} + outputs : + {x_grad : DX, y_grad : DY, bias_grad : DBias} + +- op : fused_transpose + extra : + attrs : [str data_format = "AnyLayout"] + +- op : fusion_gru + inputs : + x : X + h0 : H0 + weight_x : WeightX + weight_h : WeightH + bias : Bias + outputs : + reordered_h0 : ReorderedH0 + xx : XX + batched_input : BatchedInput + batched_out : BatchedOut + hidden : Hidden + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", float Scale_data = 1.0f, float Shift_data = 0.0f, 'float[] Scale_weights = {1.0f}'] + +- op : fusion_lstm + extra : + attrs : [bool use_mkldnn = true, str mkldnn_data_type = "float32"] + +- op : fusion_repeated_fc_relu + inputs : + x : X + w : W + bias : Bias + outputs : + relu_out : ReluOut + out : Out + +- op : fusion_seqconv_eltadd_relu + inputs : + x : X + filter : Filter + bias : Bias + outputs : + out : Out + col_mat : ColMat + attrs : + context_length : contextLength + context_start : contextStart + context_stride : contextStride + +- op : fusion_seqexpand_concat_fc + inputs : + x : X + fc_weight : FCWeight + fc_bias : FCBias + outputs : + out : Out + fc_out : FCOut + +- op : fusion_transpose_flatten_concat + inputs : + x : X + outputs : + out : Out + +- op : gather + 
backward : gather_grad + inputs : + {x : X, index : Index} + outputs : + out : Out + scalar : + axis : + data_type : int + tensor_name : Axis + +- op : gather_nd + backward : gather_nd_grad + inputs : + {x : X, index : Index} + outputs : + out : Out + +- op : gather_tree + inputs : + {ids : Ids, parents : Parents} + outputs : + out : Out + +- op : gaussian (gaussian_random) + outputs : + out : Out + int_array: + shape : + data_type : int64_t + tensor_name : ShapeTensor + tensors_name : ShapeTensorList + extra : + attrs : [bool use_mkldnn = false] + manual_signature : [gaussian] + +- op : gelu + backward : gelu_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : generate_proposals(generate_proposals_v2) + inputs : + {scores : Scores, bbox_deltas : BboxDeltas, im_shape : ImShape, anchors : Anchors, variances : Variances} + outputs : + {rpn_rois : RpnRois, rpn_roi_probs : RpnRoiProbs, rpn_rois_num : RpnRoisNum} + attrs : + {pre_nms_top_n : pre_nms_topN, post_nms_top_n : post_nms_topN} + +- op : grad_add + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : graph_khop_sampler + inputs : + {row : Row, colptr : Col_Ptr, x : X, eids : Eids} + outputs : + {out_src : Out_Src, out_dst : Out_Dst, sample_index : Sample_Index, reindex_x : Reindex_X, out_eids : Out_Eids} + +- op : graph_sample_neighbors + inputs : + {row : Row, colptr : Col_Ptr, x : X, eids : Eids, perm_buffer : Perm_Buffer} + outputs : + {out : Out, out_count : Out_Count, out_eids : Out_Eids} + +- op : greater_equal + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : greater_than + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : grid_sample(grid_sampler) + backward : grid_sample_grad (grid_sampler_grad) + inputs : + {x : X, grid : Grid} + outputs : + out : Output + extra : + attrs : [bool use_cudnn = true] + +- op : group_norm + inputs : + x : X + scale : Scale + bias : Bias + outputs : + y : Y + mean : Mean + variance : Variance + attrs: + data_format: data_layout + +- op : gru + backward : gru_grad + extra : + attrs : [bool is_test = false] + +- op : gumbel_softmax + inputs : + x : X + outputs : + out : Out + +- op : hardshrink (hard_shrink) + backward : hardshrink_grad (hard_shrink_grad) + inputs : + x : X + outputs : + out : Out + +- op : hardsigmoid (hard_sigmoid) + backward : hardsigmoid_grad (hard_sigmoid_grad) + inputs : + x : X + outputs : + out : Out + +- op : hardswish (hard_swish) + inputs : + x : X + outputs : + out : Out + backward : hardswish_grad (hard_swish_grad) + extra : + attrs : [bool use_mkldnn = false] + manual_signature : [hardswish] + +- op : hardtanh (brelu) + backward : hardtanh_grad (brelu_grad) + inputs : + x : X + outputs : + out : Out + +- op : heaviside (elementwise_heaviside) + backward : heaviside_grad (elementwise_heaviside_grad) + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + +- op : histogram + inputs : + input : X + outputs : + out : Out + +- op : hsigmoid_loss(hierarchical_sigmoid) + backward: hsigmoid_loss_grad(hierarchical_sigmoid_grad) + inputs: + {x: X, w: W, label: Label, bias: Bias, path: 
PathTable, code: PathCode} + outputs: + {out: Out, pre_out: PreOut, w_out: W_Out} + +- op : huber_loss + backward : huber_loss_grad + inputs : + {input : X, label : Y} + outputs : + {out : Out, residual : Residual} + +- op : imag + backward : imag_grad + inputs : + x : X + outputs : + out : Out + +- op : increment + inputs : + x : X + outputs : + out : Out + +- op : index_add + inputs : + {x : X, index : Index, add_value : AddValue} + outputs : + out : Out + +- op : index_sample + inputs : + {x : X, index : Index} + outputs : + out : Out + +- op : index_select + inputs : + {x : X, index : Index} + outputs : + out : Out + attrs : + axis : dim + +- op : instance_norm + inputs : + x : X + scale : Scale + bias : Bias + outputs : + y : Y + saved_mean : SavedMean + saved_variance : SavedVariance + extra: + outputs: [ saved_mean, saved_variance ] + get_expected_kernel_type: + instance_norm: GetInstanceNormExpectedKernelType + +- op : inverse + inputs : + x : Input + outputs : + out : Output + +- op : is_empty + inputs : + x : X + outputs : + out : Out + +- op : isclose + inputs : + {x : Input, y : Other} + outputs : + out : Out + scalar : + rtol : + data_type : std::string + tensor_name : Rtol + atol : + data_type : std::string + tensor_name : Atol + +- op : isfinite (isfinite_v2) + inputs : + x : X + outputs : + out : Out + +- op : isinf (isinf_v2) + inputs : + x : X + outputs : + out : Out + +- op : isnan (isnan_v2) + inputs : + x : X + outputs : + out : Out + +- op : kldiv_loss + backward : kldiv_loss_grad + inputs : + {x : X, label : Target} + outputs : + out : Loss + +- op : kron + backward : kron_grad + inputs : + {x : X, y : Y} + outputs : + {out : Out} + complex_promote : [X, Y] + +- op : kthvalue + inputs : + x : X + outputs : + {out : Out, indices : Indices} + +- op : label_smooth + inputs : + {label : X, prior_dist : PriorDist} + outputs : + out : Out + +- op : lamb_ (lamb) + inputs : + {param : Param, grad : Grad, learning_rate : LearningRate, moment1 : Moment1, moment2 : Moment2, beta1_pow : Beta1Pow, beta2_pow : Beta2Pow, master_param : MasterParam, skip_update : SkipUpdate} + outputs : + {param_out : ParamOut, moment1_out : Moment1Out, moment2_out : Moment2Out, beta1_pow_out : Beta1PowOut, beta2_pow_out : Beta2PowOut, master_param_outs : MasterParamOut} + +- op : layer_norm + backward : layer_norm_grad + inputs : + x : X + scale : Scale + bias : Bias + outputs : + out : Y + mean : Mean + variance : Variance + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + get_expected_kernel_type : + layer_norm : GetLayerNormExpectedKernelType + +- op : leaky_relu + backward : leaky_relu_grad, leaky_relu_double_grad (leaky_relu_grad_grad) + inputs : + x : X + outputs : + out : Out + attrs: + negative_slope : alpha + extra : + attrs : [bool use_mkldnn = false] + +- op : lerp + backward : lerp_grad + inputs : + {x : X, y : Y, weight : Weight} + outputs : + out : Out + +- op : less_equal + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : less_than + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : lgamma + inputs : + x : X + outputs : + out : Out + +- op : linear_interp (linear_interp_v2) + backward : linear_interp_grad (linear_interp_v2_grad) + inputs : + {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} + outputs : + output : Out + attrs: + data_format: data_layout + extra : + attrs : [bool use_mkldnn = false] + +- op : linspace + inputs : + {start : Start, stop : Stop, number : Num} + outputs : + 
out : Out + +- op : log + backward : log_grad, log_double_grad (log_grad_grad) + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : log10 + backward : log10_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : log1p + backward : log1p_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : log2 + backward : log2_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : log_loss + backward : log_loss_grad + inputs : + {input : Predicted, label : Labels} + outputs : + out : Loss + +- op : log_softmax + backward : log_softmax_grad + inputs : + x : X + outputs : + out: Out + extra : + attrs : [bool use_mkldnn = false] + +- op : logcumsumexp + backward : logcumsumexp_grad + inputs : + x : X + outputs : + out : Out + +- op : logical_and + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : logical_not + inputs : + x : X + outputs : + out : Out + +- op : logical_or + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : logical_xor + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : logit + inputs : + x : X + outputs : + out : Out + +- op : logsigmoid + backward : logsigmoid_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : logsumexp + backward : logsumexp_grad + inputs : + x : X + outputs : + out : Out + +- op : lrn + backward : lrn_grad + inputs : + x : X + outputs : + {out : Out, mid_out : MidOut} + extra : + attrs : [bool use_mkldnn = false, bool is_test = false] + +- op : lstsq + inputs : + {x : X, y : Y} + outputs : + {solution : Solution, residuals : Residuals, rank : Rank, singular_values : SingularValues} + scalar : + rcond : + data_type : float + support_tensor : true + +- op : lu_unpack + backward : lu_unpack_grad + inputs : + {x : X, y : Pivots} + outputs : + {pmat : Pmat, l : L, u : U} + +- op : margin_cross_entropy + backward : margin_cross_entropy_grad + inputs: + {logits : Logits, label : Label} + outputs: + {softmax : Softmax, loss : Loss} + +- op : masked_select + inputs : + {x : X, mask : Mask} + outputs : + out : Y + +- op : matmul (matmul_v2) + backward : matmul_grad (matmul_v2_grad), matmul_double_grad (matmul_v2_grad_grad), matmul_triple_grad (matmul_v2_triple_grad) + inputs : + {x : X, y : Y} + attrs : + {transpose_x : trans_x, transpose_y : trans_y} + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + complex_promote : [X, Y] + +- op : matmul_with_flatten (mul) + backward : matmul_with_flatten_grad (mul_grad) + inputs : + {x : X, y : Y} + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, float scale_x = 1.0f, 'float[] scale_y = {1.0f}', + float scale_out = 1.0f, bool force_fp32_output = false] + +- op : matrix_nms + inputs : + {bboxes : BBoxes, scores : Scores} + outputs : + {out : Out, index : Index, roisnum : RoisNum} + get_expected_kernel_type : + matrix_nms : GetMatrixNmsExpectedKernelType + +- op : matrix_power + inputs : + x : X + outputs : + out : Out + +- op : matrix_rank + inputs : + {x : X, tol_tensor : TolTensor} + outputs : + out : Out + manual_signature : [matrix_rank] + +- op : max (reduce_max) + backward : max_grad (reduce_max_grad) + inputs: + x : X + attrs: + { axis : dim, keepdim : keep_dim} + outputs: 
+ out : Out + extra : + attrs : [bool use_mkldnn = false] + int_array: + axis : + data_type : int + support_tensor : true + get_expected_kernel_type : + max : GetReduceExpectedKernelType + max_grad : GetReduceGradExpectedKernelType + manual_signature : [max] + +- op : max_pool2d_with_index + inputs : + {x : X} + outputs : + {out : Out, mask : Mask} + attrs : + kernel_size : ksize + +- op : max_pool3d_with_index + inputs : + {x : X} + outputs : + {out : Out, mask : Mask} + attrs : + kernel_size : ksize + +- op : maximum (elementwise_max) + backward : maximum_grad (elementwise_max_grad) + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + manual_signature : [maximum] + +- op : maxout + inputs : + x : X + outputs : + out : Out + +- op : mean (reduce_mean) + backward : mean_grad (reduce_mean_grad) + inputs : + x : X + outputs : + out : Out + attrs : + {axis : dim, keepdim : keep_dim} + extra : + attrs : [bool use_mkldnn = false] + +- op : mean_all (mean) + backward : mean_all_grad (mean_grad) + inputs : + x : X + outputs : + out : Out + +- op : merge_selected_rows + inputs : + x : X + outputs : + out : Out + +- op : merged_adam_ + inputs : + {param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam} + outputs : + {param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut} + scalar : + beta1 : + data_type : float + support_tensor : true + beta2 : + data_type : float + support_tensor : true + epsilon : + data_type : float + support_tensor : true + +- op : merged_momentum_ (merged_momentum) + inputs : + {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} + outputs : + {param_out : ParamOut, velocity_out : VelocityOut, master_param_out : MasterParamOut} + +- op : meshgrid + backward : meshgrid_grad + inputs : + inputs : X + outputs : + out : Out + drop_empty_grad : [inputs_grad] + +- op : min (reduce_min) + backward : min_grad (reduce_min_grad) + inputs: + x : X + outputs: + out : Out + attrs: + { axis : dim, keepdim : keep_dim} + extra : + attrs : [bool use_mkldnn = false] + int_array: + axis : + data_type : int + support_tensor : true + get_expected_kernel_type : + min : GetReduceExpectedKernelType + min_grad : GetReduceGradExpectedKernelType + manual_signature : [min] + +- op : minimum (elementwise_min) + backward : minimum_grad (elementwise_min_grad) + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + manual_signature : [minimum] + +- op : mish + backward : mish_grad + inputs: + {x : X, lambda : threshold} + outputs: + out: Out + extra : + attrs : [bool use_mkldnn = false] + +- op : mode + backward : mode_grad + inputs : + x : X + outputs : + {out : Out, indices : Indices} + +- op : momentum_ (momentum) + inputs : + {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} + outputs : + {param_out : ParamOut, velocity_out : 
VelocityOut, master_param_out : MasterParamOut} + +- op : multi_dot + backward : multi_dot_grad + inputs : + x : X + outputs : + out : Out + drop_empty_grad : [x_grad] + +- op : multiclass_nms3 + inputs : + {bboxes : BBoxes, scores : Scores, rois_num : RoisNum} + outputs : + {out : Out, index : Index, nms_rois_num : NmsRoisNum} + +- op : multihead_matmul + inputs : + {input : Input, w : W, bias : Bias, bias_qk : BiasQK} + outputs : + out : Out + attrs : + {transpose_q : transpose_Q, transpose_k : transpose_K, transpose_v : transpose_V} + +- op : multinomial + inputs : + {x : X} + outputs : + out : Out + scalar : + num_samples : + data_type : int + support_tensor : true + +- op : multiplex + backward : multiplex_grad + inputs : + {inputs : X, index : Ids} + outputs : + out : Out + drop_empty_grad : [inputs_grad] + +- op : multiply (elementwise_mul) + backward : multiply_grad (elementwise_mul_grad) + inputs : + {x : X, y : Y} + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : mv + inputs : + {x : X, vec : Vec} + outputs : + out : Out + +- op : nanmedian + backward : nanmedian_grad + inputs : + {x : X} + outputs : + {out : Out, medians : MedianIndex} + int_array: + axis: + data_type : int + extra: + outputs : [medians] + +- op : nce + backward : nce_grad + extra : + attrs : [int trainer_id = 0, 'int64_t[] height_sections = {}', 'str[] epmap = {}', + 'str[] table_names = {}', 'int[] custom_neg_classes = {}'] + +- op : nearest_interp (nearest_interp_v2) + backward : nearest_interp_grad (nearest_interp_v2_grad) + inputs : + {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} + outputs : + output : Out + attrs: + data_format: data_layout + extra : + attrs : [bool use_mkldnn = false] + +- op : nll_loss + backward : nll_loss_grad + inputs : + {input : X, label : Label, weight : Weight} + outputs : + {out : Out, total_weight : Total_weight} + +- op : nms + inputs : + x : Boxes + outputs : + out : KeepBoxesIdxs + attrs : + threshold : iou_threshold + +- op : nonzero (where_index) + inputs : + condition : Condition + outputs : + out : Out + +- op : norm + backward : norm_grad + inputs : + x : X + outputs : + {out : Out, norm : Norm} + extra : + outputs : [norm] + +- op : not_equal + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : numel(size) + inputs : + x : Input + outputs : + size : Out + +- op : one_hot (one_hot_v2) + inputs : + x : X + outputs : + out : Out + scalar : + depth : + data_type : int + tensor_name : depth_tensor + +- op : overlap_add + backward : overlap_add_grad + inputs : + x : X + outputs : + out : Out + +- op : p_norm + backward: p_norm_grad + inputs : + x : X + outputs : + out : Out + +- op : pad + backward : pad_grad, pad_double_grad + inputs : + x : X + outputs : + out : Out + scalar: + pad_value: + data_type : float + support_tensor : true + +- op : pad2d + backward : pad2d_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : pad3d + backward : pad3d_grad, pad3d_double_grad + inputs : + x : X + outputs : + out : Out + int_array: + paddings : + data_type : int + tensor_name : Paddings + attrs : + pad_value : value + extra : + attrs : [bool use_mkldnn = false] + +- op : partial_sum + backward : partial_sum_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : pixel_shuffle + backward : pixel_shuffle_grad + inputs : + x : X + outputs : + out : Out + +- op : pixel_unshuffle 
+ backward : pixel_unshuffle_grad + inputs : + x : X + outputs : + out : Out + +- op : poisson + inputs : + x : X + outputs : + out : Out + +- op : pool2d + backward : pool2d_grad, pool2d_double_grad + inputs : + {x : X} + outputs : + {out : Out} + attrs : + {kernel_size : ksize} + int_array: + kernel_size : + data_type : int + support_tensor : true + get_expected_kernel_type : + pool2d : GetPoolExpectedKernelType + pool2d_grad : GetPoolExpectedKernelType + pool2d_double_grad : GetPoolDoubleGradExpectedKernelType + extra : + attrs : [bool use_mkldnn = false, bool use_quantizer = false, + str mkldnn_data_type = "float32", bool is_test = false] + +- op : pool3d + backward : pool3d_grad + inputs : + {x : X} + outputs : + {out : Out} + attrs : + {kernel_size : ksize} + get_expected_kernel_type : + pool3d : GetPoolExpectedKernelType + pool3d_grad : GetPoolExpectedKernelType + extra : + attrs : [bool use_mkldnn = false] + +- op : pow + backward : pow_grad, pow_double_grad, pow_triple_grad + inputs : + x : X + outputs : + out : Out + attrs : + y : factor + scalar : + y : + data_type : float + tensor_name : FactorTensor + +- op : prelu + backward : prelu_grad + inputs : + { x : X, alpha : Alpha} + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + +- op : print + inputs : + in : In + outputs : + out : Out + +- op : prior_box + inputs : + {input: Input, image: Image} + outputs : + {out: Boxes, var: Variances} + extra : + attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] + +- op : prod (reduce_prod) + backward : prod_grad (reduce_prod_grad) + inputs: + x : X + outputs: + out : Out + attrs: + { dims : dim, keep_dim : keep_dim} + int_array: + dims : + data_type : int + support_tensor : true + extra : + attrs : [bool use_mkldnn = false] + get_expected_kernel_type : + prod : GetReduceExpectedKernelType + prod_grad : GetReduceGradExpectedKernelType + manual_signature : [prod] + +- op : psroi_pool + backward : psroi_pool_grad + inputs : + {x : X, boxes : ROIs, boxes_num : RoisNum} + outputs : + out : Out + +- op : push_sparse_v2 + inputs : + { x : Ids, W : w} + outputs : + out : Out + extra : + attrs : [int embeddingdim = 11, int tableid = 0, str accessorclass = "", str ctrlabelname = "", int paddingid = 0, bool scalesparsegrad = true, 'str[] inputnames = {}', bool is_distributed = true] + +- op : put_along_axis + backward : put_along_axis_grad + inputs : + {arr : Input, indices : Index, values : Value} + outputs : + out : Result + attrs : + {axis : Axis, reduce : Reduce, include_self: Include_self} + +- op : pylayer + backward : pylayer_grad + extra : + attrs : ['str[] skip_eager_deletion_vars = {}'] + +- op : qr + backward : qr_grad + inputs : + x : X + outputs : + {q : Q, r : R} + +- op : quantize + inputs : + input : Input + outputs : + output : Output + attrs : + {scale : Scale, shift : Shift, include_self: Include_self} + +- op : quantize_linear + extra : + attrs : [float moving_rate = 0.9] + +- op : randint + outputs : + out : Out + int_array: + shape : + data_type : int64_t + tensor_name : ShapeTensor + tensors_name : ShapeTensorList + manual_signature : [randint] + +- op : randperm + outputs : + out : Out + extra : + attrs : [int seed = 0] + +- op : real + backward : real_grad + inputs : + x : X + outputs : + out : Out + +- op : reciprocal + backward : reciprocal_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = 
false] + +- op : relu + backward : relu_grad, relu_double_grad (relu_grad_grad) + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : relu6 + backward : relu6_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, float threshold = 6.0] + +- op : remainder (elementwise_mod) + inputs : + {x : X, y : Y} + outputs : + {out : Out} + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + manual_signature : [remainder] + +- op : renorm + backward : renorm_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : repeat_interleave + inputs : + x : X + outputs : + out : Out + attrs : + repeats : Repeats + +- op : repeat_interleave + backward : repeat_interleave_grad + inputs : + x : X + outputs : + out : Out + attrs : + {repeats : Repeats, axis : dim} + +- op : repeat_interleave_with_tensor_index + backward : repeat_interleave_with_tensor_index_grad + inputs : + {x : X, repeats: RepeatTensor} + outputs: + out : Out + attrs: + axis : dim + +- op : requantize + inputs : + input : Input + outputs : + output : Output + attrs : + {scale_in : Scale_in, scale_out : Scale_out, shift_in : Shift_in, shift_out : Shift_out} + +- op : reshape (reshape2) + backward : reshape_grad (reshape2_grad) + inputs: + x : X + outputs: + out : Out + xshape: XShape + int_array: + shape : + data_type : int + tensor_name : Shape + tensors_name : ShapeTensor + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] + +- op : reverse + inputs: + x : X + outputs: + out : Out + int_array: + axis : + data_type : int + support_tensor : true + manual_signature : [reverse] + +- op : rmsprop_ (rmsprop) + inputs : + {param: Param, mean_square: MeanSquare, mean_grad: MeanGrad, learning_rate: LearningRate, grad: Grad, moment: Moment, master_param: MasterParam} + outputs : + {param_out: ParamOut, moment_out: MomentOut, mean_square_out: MeanSquareOut, mean_grad_out: MeanGradOut, master_param_outs: MasterParamOut} + +- op : rnn + backward : rnn_grad + inputs: + { x : Input, pre_state : PreState, weight_list : WeightList, sequence_length : SequenceLength} + outputs: + { out : Out, dropout_state_out : DropoutState, state : State, reserve : Reserve} + drop_empty_grad : [pre_state_grad, weight_list_grad] + +- op : roi_align + backward : roi_align_grad + inputs : + {x : X, boxes : ROIs, boxes_num : RoisNum} + outputs : + out : Out + +- op : roi_pool + backward : roi_pool_grad + inputs : + {x : X, boxes : ROIs, boxes_num : RoisNum} + outputs : + {out : Out, arg_max : Argmax} + +- op : roll + backward : roll_grad + inputs : + x : X + outputs : + out : Out + int_array : + shifts : + data_type : int64_t + tensor_name : ShiftsTensor + +- op : round + backward : round_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : row_conv + backward : row_conv_grad + inputs : + {x : X, filter : Filter} + outputs : + {out : Out} + +- op : rsqrt + backward : rsqrt_grad, rsqrt_double_grad (rsqrt_grad_grad) + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : save_combine + inputs : + {x : X} + +- op : scale + backward : scale_grad + inputs : + x : X + 
outputs : + out : Out + scalar : + scale : + data_type : float + tensor_name : ScaleTensor + extra : + attrs : [bool use_mkldnn = false] + +- op : scatter + backward : scatter_grad + inputs : + {x : X, index : Ids, updates : Updates} + outputs : + out : Out + +- op : scatter_nd_add + backward : scatter_nd_add_grad + inputs : + {x : X, index : Index, updates : Updates} + outputs : + out : Out + +- op : searchsorted + inputs : + {sorted_sequence : SortedSequence, values : Values} + outputs : + out : Out + +- op : seed + outputs : + out : Out + extra : + attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] + +- op : segment_pool + backward : segment_pool_grad + inputs : + {x : X, segment_ids : SegmentIds} + outputs : + {out : Out, summed_ids : SummedIds} + +- op : self_dp_attention + inputs : + x : X + outputs : + out : Out + +- op : selu + backward : selu_grad + inputs : + x : X + outputs : + out : Out + +- op : send_u_recv(graph_send_recv) + backward : send_u_recv_grad(graph_send_recv_grad) + inputs : + {x : X, src_index : Src_index, dst_index : Dst_index} + outputs : + {out : Out, dst_count : Dst_count} + int_array : + out_size: + data_type : int64_t + tensor_name : Out_size + +- op : send_ue_recv(graph_send_ue_recv) + backward : send_ue_recv_grad(graph_send_ue_recv_grad) + inputs : + {x : X, y : Y, src_index : Src_index, dst_index : Dst_index} + outputs : + {out : Out, dst_count : Dst_count} + int_array : + out_size: + data_type : int64_t + tensor_name : Out_size + +- op : send_uv (graph_send_uv) + backward : send_uv_grad (graph_send_uv_grad) + +- op : sequence_mask + inputs: + x : X + attrs: + max_len: maxlen + outputs: + y : Y + scalar : + max_len : + data_type : int + tensor_name : MaxLenTensor + +- op : sequence_softmax + backward : sequence_softmax_grad + extra : + attrs : [str data_format = "AnyLayout"] + +- op : sgd_ (sgd) + inputs : + {param : Param, learning_rate : LearningRate, grad : Grad, master_param : MasterParam} + outputs : + {param_out : ParamOut, master_param_out : MasterParamOut} + get_expected_kernel_type : + sgd_ : GetSgdExpectedKernelType + extra : + attrs : [bool use_mkldnn=false] + +- op : shape + inputs : + input : Input + outputs : + out : Out + +- op : shape + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : shard_index + inputs : + input : X + outputs : + out : Out + +- op : share_buffer + inputs : + x : X + outputs : + out : Out + xout : XOut + +- op : shuffle_batch + backward: shuffle_batch_grad + inputs: + {x : X, seed : Seed} + outputs: + {out : Out, shuffle_idx : ShuffleIdx, seed_out : SeedOut} + +- op : shuffle_channel + backward : shuffle_channel_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : sigmoid + backward : sigmoid_grad, sigmoid_double_grad (sigmoid_grad_grad), sigmoid_triple_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : sign + backward : sign_grad + inputs : + x : X + outputs : + out : Out + +- op : silu + backward : silu_grad, silu_double_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : sin + backward : sin_grad, sin_double_grad, sin_triple_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : sinh + backward : sinh_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- 
op : slice + backward : slice_grad + inputs : + input : Input + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + int_array : + starts : + data_type : int + tensor_name : StartsTensor + tensors_name : StartsTensorList + ends : + data_type : int + tensor_name : EndsTensor + tensors_name : EndsTensorList + +- op : slogdet(slogdeterminant) + backward : slogdet_grad(slogdeterminant_grad) + inputs : + x : Input + outputs : + out : Out + +- op : soft_relu + backward : soft_relu_grad + inputs : + x : X + outputs : + out : Out + +- op : softmax + backward : softmax_grad + inputs : + x : X + outputs : + out : Out + get_expected_kernel_type : + softmax : GetSoftmaxExpectedKernelType + softmax_grad : GetSoftmaxGradExpectedKernelType + extra : + attrs : [str data_format = "AnyLayout", bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + +- op : softplus + backward : softplus_grad, softplus_double_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : softshrink + backward : softshrink_grad + inputs : + x : X + outputs : + out : Out + attrs : + threshold : lambda + +- op : softsign + backward : softsign_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : solve + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : spectral_norm + backward : spectral_norm_grad + inputs : + {weight : Weight, u : U, v : V} + outputs : + out : Out + +- op : split + backward : split_grad + inputs: + x : X + outputs: + out : Out + int_array: + sections : + data_type : int + support_tensor : true + scalar : + axis : + data_type : int + support_tensor : true + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : split_with_num + scalar : + axis : + data_type : int + support_tensor : true + tensor_name : AxisTensor + +- op : sqrt + backward : sqrt_grad, sqrt_double_grad (sqrt_grad_grad) + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : square + backward : square_grad, square_double_grad (square_grad_grad) + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : squeeze (squeeze2) + backward : squeeze_grad (squeeze2_grad), squeeze_double_grad(squeeze2_double_grad) + inputs : + x : X + attrs : + axis : axes + outputs : + {out : Out, xshape : XShape} + int_array: + axis : + data_type : int + support_tensor : true + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + outputs : [xshape] + +- op : stack + backward : stack_grad + inputs : + x : X + outputs : + out : Y + extra : + attrs : [bool use_mkldnn = false] + drop_empty_grad : [x_grad] + +- op : stanh + backward : stanh_grad + inputs : + x : X + outputs : + out : Out + +- op : strided_slice + backward : strided_slice_grad + inputs : + x : Input + outputs : + out : Out + int_array : + starts : + data_type : int + tensor_name : StartsTensor + tensors_name : StartsTensorList + ends : + data_type : int + tensor_name : EndsTensor + tensors_name : EndsTensorList + strides : + data_type : int + tensor_name : StridesTensor + tensors_name : StridesTensorList + manual_signature : [strided_slice, strided_slice_grad] + get_expected_kernel_type : + strided_slice : GetStridedSliceExpectedKernelType + strided_slice_grad : 
GetStridedSliceGradExpectedKernelType + +- op : subtract (elementwise_sub) + backward : subtract_grad (elementwise_sub_grad) + inputs : + {x : X, y: Y} + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + complex_promote : [X, Y] + +- op : sum (reduce_sum) + backward : sum_grad (reduce_sum_grad), sum_double_grad + inputs: + {x : X} + outputs: + out : Out + attrs: + { axis : dim, keepdim : keep_dim, dtype : out_dtype} + extra : + attrs : [bool use_mkldnn = false] + int_array: + axis : + data_type : int + support_tensor : true + get_expected_kernel_type : + sum : GetReduceExpectedKernelType + sum_grad : GetReduceGradExpectedKernelType + manual_signature : [sum] + +- op : svd + backward : svd_grad + inputs : + x : X + outputs : + {u : U, s : S, vh : VH} + +- op : swish + backward : swish_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, float beta = 1.0] + +- op : sync_batch_norm + inputs : + {x : X, scale : Scale, bias : Bias, mean : Mean, variance : Variance} + outputs : + {out : Y, mean_out : MeanOut, variance_out : VarianceOut, saved_mean : SavedMean, saved_variance : SavedVariance, reserve_space : ReserveSpace} + backward : sync_batch_norm_grad + attrs: + data_format: data_layout + extra : + attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + +- op : take_along_axis + backward : take_along_axis_grad + inputs : + {arr : Input, indices : Index} + outputs : + out : Result + attrs : + axis : Axis + +- op : tan + backward : tan_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : tanh + backward : tanh_grad, tanh_double_grad (tanh_grad_grad), tanh_triple_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : tanh_shrink + backward : tanh_shrink_grad + inputs : + x : X + outputs : + out : Out + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : tdm_sampler + inputs: + {x : X, travel : Travel, layer : Layer} + outputs: + {out : Out, labels : Labels, mask : Mask} + +- op : thresholded_relu + inputs : + x : X + outputs : + out : Out + +- op : tile + backward : tile_grad, tile_double_grad + inputs : + x : X + outputs : + out : Out + int_array: + repeat_times : + data_type : int + tensor_name : RepeatTimes + tensors_name : repeat_times_tensor + +- op : topk (top_k_v2) + backward : topk_grad (top_k_v2_grad) + inputs : + x : X + outputs : + {out : Out, indices : Indices} + scalar : + k : + data_type : int + tensor_name : K + +- op : trace + inputs : + x : Input + outputs : + out : Out + +- op : transpose (transpose2) + backward : transpose_grad (transpose2_grad) + inputs : + x : X + outputs : + out : Out + attrs: + perm : axis + extra : + outputs : [XShape] + attrs : [bool use_mkldnn = false, str data_format = "AnyLayout", str mkldnn_data_type = "float32"] + +- op : triangular_solve + backward : triangular_solve_grad + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : tril_triu + backward : tril_triu_grad + inputs : + {x: X} + outputs : + {out : Out} + +- op : trilinear_interp (trilinear_interp_v2) + backward : trilinear_interp_grad (trilinear_interp_v2_grad) + inputs : + {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} + outputs : + output : Out + attrs: + data_format: data_layout + 
extra : + attrs : [bool use_mkldnn = false] + +- op : trunc + inputs : + input : X + outputs : + out : Out + +- op : truncated_gaussian_random + outputs : + out : Out + +- op : unbind + inputs : + input : X + outputs : + out : Out + +- op : unfold + inputs : + x : X + outputs : + out : Y + +- op : uniform (uniform_random) + outputs : + out : Out + int_array : + shape : + data_type : int64_t + tensor_name : ShapeTensor + tensors_name : ShapeTensorList + scalar : + min : + data_type : float + support_tensor : true + max : + data_type : float + support_tensor : true + manual_signature : [uniform] + +- op : uniform_inplace (uniform_random_inplace) + backward : uniform_inplace_grad(uniform_random_inplace_grad) + inputs : + x : X + outputs : + out : Out + +- op : unique + inputs : + {x : X} + outputs : + {out : Out, indices : Indices, inverse : Index, counts : Counts} + get_expected_kernel_type : + unique : GetUniqueExpectedKernelType + manual_signature : [unique] + +- op : unique_consecutive + inputs : + x : X + outputs : + {out : Out, index : Index, counts : Counts} + +- op : unpool + inputs : + {x : X, indices: Indices} + outputs : + out : Out + attrs : + padding : paddings + int_array : + output_size: + data_type : int + support_tensor : true + +- op : unpool3d + inputs : + {x : X, indices: Indices} + outputs : + out : Out + +- op : unsqueeze (unsqueeze2) + backward : unsqueeze_grad (unsqueeze2_grad), unsqueeze_double_grad(unsqueeze2_double_grad) + inputs : + x : X + attrs : + axis : axes + outputs : + {out : Out, xshape : XShape} + int_array: + axis : + data_type : int + tensor_name : AxesTensor + tensors_name : AxesTensorList + extra : + outputs : [xshape] + +- op : unstack + backward : unstack_grad + inputs : + x : X + outputs : + out : Y + +- op : update_loss_scaling_(update_loss_scaling) + inputs : + {x : X, found_infinite : FoundInfinite, prev_loss_scaling : PrevLossScaling, in_good_steps : InGoodSteps, in_bad_steps : InBadSteps} + outputs : + {out : Out, loss_scaling : LossScaling, out_good_steps : OutGoodSteps, out_bad_steps : OutBadSteps} + scalar : + stop_update : + data_type : bool + tensor_name : StopUpdate + get_expected_kernel_type : + update_loss_scaling_ : GetUpdateLossScalingExpectedKernelType + +- op : viterbi_decode + inputs : + {potentials : Input, transition_params : Transition, lengths : Length} + outputs : + {scores : Scores, path : Path} + +- op : warpctc + backward : warpctc_grad + inputs : + {logits : Logits, label : Label, logits_length : LogitsLength, labels_length : LabelLength} + outputs : + {warpctcgrad : WarpCTCGrad, loss : Loss} + +- op : where + backward : where_grad + inputs : + {condition : Condition, x : X, y : Y} + outputs : + out : Out + +- op : while + backward : while_grad + extra : + attrs : ['str[] skip_eager_deletion_vars = {}'] + +- op : yolo_box + inputs : + {x : X, img_size : ImgSize} + outputs : + {boxes : Boxes, scores : Scores} + +- op : yolo_loss (yolov3_loss) + backward: yolo_loss_grad (yolov3_loss_grad) + inputs : + {x : X, gt_box : GTBox, gt_label : GTLabel ,gt_score : GTScore} + outputs : + {loss : Loss , objectness_mask : ObjectnessMask, gt_match_mask : GTMatchMask} + get_expected_kernel_type : + yolo_loss : GetYoloLossExpectedKernelType + yolo_loss_grad : GetYoloLossExpectedKernelType + +- op: c_allgather + inputs : + x : X + outputs : + out: Out + +- op: c_allreduce_max + inputs : + x : X + outputs : + out: Out + +- op: c_allreduce_min + inputs : + x : X + outputs : + out: Out + +- op: c_allreduce_prod + inputs : + x : X + outputs : + 
out: Out + +- op: c_allreduce_sum + inputs : + x : X + outputs : + out: Out + +- op: c_broadcast + inputs : + x : X + outputs : + out : Out + +- op: c_identity + inputs : + x : X + outputs : + out: Out + +- op: c_reduce_min + inputs : + x : X + outputs : + out: Out + +- op: c_reduce_sum + inputs : + x : X + outputs : + out: Out + +- op: c_reducescatter + inputs : + x : X + outputs : + out: Out + +- op: c_sync_calc_stream + inputs : + x : X + outputs : + out : Out + +- op: c_sync_comm_stream + inputs : + x : X + outputs : + out : Out + +- op: channel_shuffle + inputs: + {x: X} + outputs: + {out: Out} + +- op: decayed_adagrad + inputs: + {param : Param, grad : Grad, moment : Moment, learning_rate : LearningRate} + outputs: + {param_out : ParamOut, moment_out : MomentOut} + +- op: distribute_fpn_proposals + inputs : + {fpn_rois: FpnRois, rois_num: RoisNum} + outputs : + multi_fpn_rois : MultiFpnRois + multi_level_rois_num: MultiLevelRoIsNum + restore_index: RestoreIndex + +- op: distributed_lookup_table + inputs: + {ids: Ids, w: W} + outputs: + outputs: Outputs + +- op: dpsgd + inputs: + {param: Param,grad: Grad,learning_rate: LearningRate} + outputs: + param_out : ParamOut + +- op: fetch (fetch_v2) + inputs: {x: X} + outputs: {out: Out} + +- op: ftrl + inputs: + {param: Param, squared_accumulator: SquaredAccumulator, linear_accumulator: LinearAccumulator, grad: Grad, learning_rate: LearningRate} + outputs: + {param_out: ParamOut, squared_accum_out: SquaredAccumOut, linear_accum_out: LinearAccumOut} + +- op: full_batch_size_like (fill_constant_batch_size_like) + inputs: + {input: Input} + outputs: + {out: Out} + +- op: fused_elemwise_add_activation + backward: fused_elemwise_add_activation_grad + inputs : + {x: X, y: Y} + outputs : + {out : Out, intermediate_out : IntermediateOut} + +- op: fusion_squared_mat_sub + inputs : + x : X + y : Y + outputs : + squared_x : SquaredX + squared_y : SquaredY + squared_xy : SquaredXY + out : Out + +- op: get_tensor_from_selected_rows + inputs : + x : X + outputs : + out : Out + +- op: identity_loss + inputs : + x: X + outputs : + out : Out + +- op: lars_momentum + inputs: + {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} + outputs : + {param_out: ParamOut, velocity_out: VelocityOut, master_param_out: MasterParamOut} + +- op: lod_array_length + inputs : + {x: X} + outputs : + out : Out + +- op: logspace + inputs: + {start: Start, stop: Stop, num: Num, base: Base} + outputs: + {out: Out} + +- op: lu + backward: lu_grad + inputs: + x: X + outputs: + {out: Out, pivots : Pivots, infos : Infos} + attrs: + pivot : pivots + +- op: match_matrix_tensor + backward: match_matrix_tensor_grad + inputs: + {x : X, y : Y, w : W} + outputs: + {out : Out, tmp : Tmp} + +- op: memcpy + inputs: + x: X + outputs: + out: Out + +- op: memcpy_d2h + inputs : + x : X + outputs : + out : Out + +- op: nce + backward: nce_grad + inputs: + {input : Input, label : Label, weight : Weight, bias : Bias, sample_weight : SampleWeight, custom_dist_probs : CustomDistProbs, custom_dist_alias : CustomDistAlias, custom_dist_alias_probs : CustomDistAliasProbs} + outputs: + {cost : Cost, sample_logits : SampleLogits, sample_labels : SampleLabels} + +- op: number_count + inputs : + {numbers: numbers} + outputs : + out : Out + +- op: read_from_array + inputs: + array : X + i : I + outputs : + out : Out + +- op: recv_v2 + outputs : + out : Out + +- op: reindex_graph (graph_reindex) + inputs : + {x : X, neighbors : Neighbors, count : Count, 
hashtable_value : HashTable_Value, hashtable_index : HashTable_Index}
+  outputs :
+    {reindex_src : Reindex_Src, reindex_dst : Reindex_Dst, out_nodes : Out_Nodes}
+
+- op: rrelu
+  inputs:
+    {x: X}
+  outputs:
+    {out: Out, noise: Noise}
+
+- op: send_v2
+  inputs :
+    x : X
+
+- op: set_value
+  backward: set_value_grad
+  inputs:
+    x : Input
+  outputs:
+    out: Out
+  int_array:
+    starts:
+      data_type : int64_t
+      tensors_name : StartsTensorList
+    ends:
+      data_type : int64_t
+      tensors_name : EndsTensorList
+    steps:
+      data_type : int64_t
+      tensors_name : StepsTensorList
+
+- op: set_value_with_tensor
+  backward: set_value_grad
+  inputs:
+    x : Input
+  outputs:
+    out: Out
+  int_array:
+    starts:
+      data_type : int64_t
+      tensors_name : StartsTensorList
+    ends:
+      data_type : int64_t
+      tensors_name : EndsTensorList
+    steps:
+      data_type : int64_t
+      tensors_name : StepsTensorList
+
+- op: share_data
+  inputs :
+    x : X
+  outputs :
+    out : Out
+
+- op: sigmoid_cross_entropy_with_logits
+  backward: sigmoid_cross_entropy_with_logits_grad
+  inputs :
+    {x: X, label: Label}
+  outputs :
+    out : Out
+
+- op: skip_layernorm
+  inputs :
+    {x: X, y: Y, scale: Scale, bias : Bias}
+  outputs :
+    out : Out
+
+- op: sparse_momentum
+  inputs :
+    {param: Param, grad: Grad, velocity: Velocity, index: Index, axis: Axis, learning_rate: LearningRate,master_param: MasterParam}
+  outputs :
+    {param_out: ParamOut, velocity_out: VelocityOut, master_param_out: MasterParamOut}
+  scalar:
+    axis:
+      datatype : int
+      tensor_name : Axis
+
+- op: squared_l2_norm
+  backward: squared_l2_norm_grad
+  inputs :
+    x : X
+  outputs :
+    out : Out
+
+- op: temporal_shift
+  backward: temporal_shift_grad
+  inputs :
+    x : X
+  outputs :
+    out : Out
+
+- op: uniform_random_batch_size_like
+  inputs:
+    input : Input
+  outputs:
+    out: Out
+
+- op: write_to_array
+  inputs :
+    {x: X, i: I}
+  outputs :
+    out : Out
diff --git a/test/legacy_test/test_fusion_gru_op.py b/test/legacy_test/test_fusion_gru_op.py
index a86fd9b1f7b7ce..f36a1fd4a72cb9 100644
--- a/test/legacy_test/test_fusion_gru_op.py
+++ b/test/legacy_test/test_fusion_gru_op.py
@@ -114,7 +114,9 @@ def setUp(self):
     def test_check_output(self):
         for use_seq in {True, False}:
             self.attrs['use_seq'] = use_seq
-            self.check_output(check_dygraph=False)
+            self.check_output(
+                check_dygraph=False, check_pir_onednn=self.check_pir_onednn
+            )
 
 
 class TestFusionGRUOpNoInitial(TestFusionGRUOp):
diff --git a/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py b/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
index 93b141f9eefca5..ae44798dce4eb3 100644
--- a/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
+++ b/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py
@@ -32,7 +32,9 @@ def set_confs(self):
     def test_check_output(self):
         for use_seq in {True, False}:
             self.attrs['use_seq'] = use_seq
-            self.check_output(check_dygraph=False)
+            self.check_output(
+                check_dygraph=False, check_pir_onednn=self.check_pir_onednn
+            )
 
     def setUp(self):
         self.op_type = "fusion_gru"
diff --git a/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py
index 352ce5bc5db5fb..b7b775de2581e3 100644
--- a/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py
+++ b/test/mkldnn/test_fusion_gru_int8_mkldnn_op.py
@@ -150,7 +150,11 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False, atol=self.error_margin)
+        self.check_output(
+            check_dygraph=False,
+            atol=self.error_margin,
+            check_pir_onednn=self.check_pir_onednn,
+        )
 
 
 class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUINT8MKLDNNOp):
diff --git a/test/mkldnn/test_fusion_gru_mkldnn_op.py b/test/mkldnn/test_fusion_gru_mkldnn_op.py
index 9e619b73ff1793..112c7c1389dc02 100644
--- a/test/mkldnn/test_fusion_gru_mkldnn_op.py
+++ b/test/mkldnn/test_fusion_gru_mkldnn_op.py
@@ -20,30 +20,35 @@
 class TestFusionGRUMKLDNNOp(TestFusionGRUOp):
     def set_confs(self):
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 class TestFusionGRUMKLDNNOpNoInitial(TestFusionGRUOp):
     def set_confs(self):
         self.with_h0 = False
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 class TestFusionGRUMKLDNNOpNoBias(TestFusionGRUOp):
     def set_confs(self):
         self.with_bias = False
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 class TestFusionGRUMKLDNNOpReverse(TestFusionGRUOp):
     def set_confs(self):
         self.is_reverse = True
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 class TestFusionGRUMKLDNNOpOriginMode(TestFusionGRUOp):
     def set_confs(self):
         self.origin_mode = True
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 class TestFusionGRUMKLDNNOpMD1(TestFusionGRUOp):
@@ -51,6 +56,7 @@ def set_confs(self):
         self.M = 36
         self.D = 8
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 class TestFusionGRUMKLDNNOpMD2(TestFusionGRUOp):
@@ -58,6 +64,7 @@ def set_confs(self):
         self.M = 8
         self.D = 8
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 class TestFusionGRUMKLDNNOpMD3(TestFusionGRUOp):
@@ -65,6 +72,7 @@ def set_confs(self):
         self.M = 17
         self.D = 15
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):
@@ -72,6 +80,7 @@ def set_confs(self):
         self.lod = [[3]]
         self.D = 16
         self.use_mkldnn = True
+        self.check_pir_onednn = True
 
 
 if __name__ == "__main__":
diff --git a/test/mkldnn/test_layer_norm_bf16_mkldnn_op.py b/test/mkldnn/test_layer_norm_bf16_mkldnn_op.py
index a67dd64a4fbd4f..5711ad4436acea 100644
--- a/test/mkldnn/test_layer_norm_bf16_mkldnn_op.py
+++ b/test/mkldnn/test_layer_norm_bf16_mkldnn_op.py
@@ -23,6 +23,7 @@
     TestLayerNormMKLDNNOp,
     _reference_layer_norm_naive,
 )
+from utils import compare_legacy_with_pt
 
 from paddle import base, enable_static
 from paddle.base import core
@@ -132,6 +133,7 @@ def check_forward(
         self.__assert_close(mean, out[1], "mean")
         self.__assert_close(variance, out[2], "variance", 1e-3)
 
+    @compare_legacy_with_pt
     def test_check_forward_with_is_test(self):
         self.check_forward(
             shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True
diff --git a/test/mkldnn/test_layer_norm_mkldnn_op.py b/test/mkldnn/test_layer_norm_mkldnn_op.py
index c225469e71cc80..c53687872307c6 100644
--- a/test/mkldnn/test_layer_norm_mkldnn_op.py
+++ b/test/mkldnn/test_layer_norm_mkldnn_op.py
@@ -19,6 +19,7 @@
 
 import numpy as np
 from op_test import OpTestTool, _set_use_system_allocator
+from utils import compare_legacy_with_pt
 
 from paddle import base, enable_static
 from paddle.base import core
@@ -143,17 +144,21 @@ def check_forward(
         self.__assert_close(variance, out[2], "variance", 1e-3)
 
     @OpTestTool.skip_if_not_cpu_bf16()
+    @compare_legacy_with_pt
     def test_check_forward_non_last_begin_norm_axis(self):
         self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=2)
 
+    @compare_legacy_with_pt
     def test_check_forward_with_scale_and_bias(self):
         self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
+    @compare_legacy_with_pt
     def test_check_forward_without_scale_and_bias(self):
         self.check_forward(
             shape=[2, 3, 4, 5], begin_norm_axis=3, with_scale_bias=False
         )
 
+    @compare_legacy_with_pt
     def test_check_forward_with_is_test(self):
         self.check_forward(
             shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True
diff --git a/test/mkldnn/test_matmul_v2_mkldnn_op.py b/test/mkldnn/test_matmul_v2_mkldnn_op.py
index 8c9fb2e0928354..42c592cca9bdf0 100644
--- a/test/mkldnn/test_matmul_v2_mkldnn_op.py
+++ b/test/mkldnn/test_matmul_v2_mkldnn_op.py
@@ -83,10 +83,12 @@ def setUp(self):
         self.outputs = {'Out': result}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_pir_onednn=True, check_dygraph=False)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out')
+        self.check_grad(
+            ['X', 'Y'], 'Out', check_pir_onednn=True, check_dygraph=False
+        )
 
 
 class TestMatMulV2VectorXMatrixTransposeYOneDNNOp(
@@ -313,7 +315,9 @@ def set_dtype_attr(self):
         self.attrs['mkldnn_data_type'] = "bfloat16"
 
     def test_check_output(self):
-        self.check_output_with_place(core.CPUPlace())
+        self.check_output_with_place(
+            core.CPUPlace(), check_pir_onednn=True, check_dygraph=False
+        )
 
     def test_check_grad(self):
         self.calculate_grads()
@@ -323,6 +327,8 @@ def test_check_grad(self):
             "Out",
             user_defined_grads=[self.dx, self.dy],
             user_defined_grad_outputs=[convert_float_to_uint16(self.dout)],
+            check_pir_onednn=True,
+            check_dygraph=False,
         )
 
     def matmul_grad(self, x, transpose_x, y, transpose_y):

From cf0efb4f96e686dd287ceebaf7d2463e1725308d Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Tue, 6 Feb 2024 08:11:10 +0000
Subject: [PATCH 3/9] refine

---
 test/legacy_test/op_compat.yaml | 3758 -------------------------------
 1 file changed, 3758 deletions(-)
 delete mode 100755 test/legacy_test/op_compat.yaml

diff --git a/test/legacy_test/op_compat.yaml b/test/legacy_test/op_compat.yaml
deleted file mode 100755
index 026b8ca617e593..00000000000000
--- a/test/legacy_test/op_compat.yaml
+++ /dev/null
@@ -1,3758 +0,0 @@
-# All the configuration in this file are only for existing operators,
-# which cannot be modified in principle. There's no need to configure
-# this file for new operator.
-#
-# This file is used for two purposes:
-# 1. Configure the mapping relationship of parameter names of operator
-# between the operators in ops.yaml and the old operators defined
-# in fluid.
-# 2. Save the extra parameters in the OpMaker of operators temporarily,
-# which will be removed in the future.
- -# - op : rnn -# backward : rnn_grad -# extra : -# attrs : [bool is_test = false] - -- op : abs - backward : abs_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false] - -- op : accuracy - inputs : - {x : Out , indices : Indices, label: Label} - outputs : - {accuracy : Accuracy, correct : Correct, total : Total} - -- op : acos - inputs : - x : X - outputs : - out : Out - -- op : acosh - inputs : - x : X - outputs : - out : Out - backward : acosh_grad - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : adadelta_ (adadelta) - inputs : - {param : Param, grad: Grad, avg_squared_grad : AvgSquaredGrad, avg_squared_update : AvgSquaredUpdate, learning_rate : LearningRate, master_param : MasterParam } - outputs : - {param_out : ParamOut, moment_out : AvgSquaredGradOut, inf_norm_out : AvgSquaredUpdateOut, master_param_out : MasterParamOut} - -- op : adagrad_ (adagrad) - inputs : - { param : Param, grad : Grad, moment : Moment, learning_rate : LearningRate, master_param : MasterParam } - outputs : - { param_out : ParamOut, moment_out : MomentOut, master_param_out : MasterParamOut } - -- op : adam_ (adam) - inputs : - {param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam, skip_update: SkipUpdate} - outputs : - {param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut} - scalar : - beta1 : - data_type : float - tensor_name : Beta1Tensor - beta2 : - data_type : float - tensor_name : Beta2Tensor - epsilon : - data_type : float - tensor_name : EpsilonTensor - manual_signature : [adam_] - -- op : adamax_ (adamax) - inputs : - {param : Param, grad: Grad, learning_rate : LearningRate, moment : Moment, inf_norm : InfNorm, beta1_pow : Beta1Pow, master_param : MasterParam} - outputs : - {param_out : ParamOut, moment_out : MomentOut, inf_norm_out : InfNormOut, master_param_out : MasterParamOut} - -- op : adamw_ (adamw) - inputs : - {param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam, skip_update: SkipUpdate} - outputs : - {param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut} - scalar : - beta1 : - data_type : float - tensor_name : Beta1Tensor - beta2 : - data_type : float - tensor_name : Beta2Tensor - epsilon : - data_type : float - tensor_name : EpsilonTensor - -- op : add (elementwise_add) - backward : add_grad (elementwise_add_grad), add_double_grad (elementwise_add_grad_grad), add_triple_grad (elementwise_add_triple_grad) - inputs : - {x : X, y : Y} - outputs : - {out : Out} - attrs : - {scale_x : Scale_x, scale_y : Scale_y, scale_out : Scale_out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - -- op : add_n (sum) - inputs: - {inputs : X} - outputs: - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - -- op : addmm - backward : addmm_grad - inputs : - {input : Input, x : X, y : Y} - outputs : - out : Out - attrs : - {alpha : Alpha, beta : Beta} - extra : - attrs : [bool use_mkldnn = false] - -- op : affine_grid - backward : 
affine_grid_grad - inputs : - input : Theta - outputs : - output : Output - int_array: - output_shape : - data_type : int - tensor_name : OutputShape - extra : - attrs : [bool use_cudnn = true] - -- op : all (reduce_all) - inputs: - x : X - attrs: - { axis : dim, keepdim : keep_dim} - outputs: - out : Out - manual_signature : [all] - extra : - attrs : [bool use_mkldnn = false] - -- op : allclose - inputs : - {x : Input, y : Other} - outputs : - out : Out - scalar : - rtol : - data_type : std::string - tensor_name : Rtol - atol : - data_type : std::string - tensor_name : Atol - -- op : amax (reduce_amax) - backward : amax_grad (reduce_amax_grad) - inputs : - x : X - outputs : - out : Out - attrs: - { axis : dim, keepdim : keep_dim } - extra : - attrs : [bool use_mkldnn = false] - get_expected_kernel_type : - amax_grad : GetReduceGradExpectedKernelType - manual_signature : [amax] - -- op : amin (reduce_amin) - backward : amin_grad (reduce_amin_grad) - inputs : - x : X - outputs : - out : Out - attrs: - { axis : dim, keepdim : keep_dim } - extra : - attrs : [bool use_mkldnn = false] - get_expected_kernel_type : - amin_grad : GetReduceGradExpectedKernelType - manual_signature : [amin] - -- op : angle - backward : angle_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false] - -- op : any (reduce_any) - inputs : - x : X - outputs : - out : Out - attrs: - { axis : dim, keepdim : keep_dim } - extra : - attrs : [bool use_mkldnn = false] - get_expected_kernel_type : - any : GetReduceOpUseInputPlaceExpectedKernelType - manual_signature : [any] - -- op : arange(range) - inputs : - {start : Start, end : End, step : Step} - outputs : - out : Out - -- op : argmax(arg_max) - inputs : - x : X - outputs : - out : Out - scalar: - axis: - data_type : int64_t - support_tensor : true - -- op : argmin(arg_min) - inputs : - x : X - outputs : - out : Out - scalar: - axis: - data_type : int64_t - support_tensor : true - -- op : argsort - inputs : - x : X - outputs : - out : Out - indices : Indices - -- op : array_to_tensor(tensor_array_to_tensor) - backward : tanh_shrink_grad - inputs : - x : X - outputs : - out : Out - out_index : OutIndex - -- op : as_complex - inputs : - x : X - outputs : - out : Out - -- op : as_real - inputs : - x : X - outputs : - out : Out - -- op : asin - inputs : - x : X - outputs : - out : Out - -- op : asinh - backward : asinh_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : assert - inputs : - {cond : Cond, data : Data} - -- op : assign - backward : assign_grad - inputs : - x : X - outputs : - out : Out - manual_signature : [assign, assign_grad] - get_expected_kernel_type : - assign : GetAssignExpectedKernelType - -- op : assign_value - outputs : - out : Out - manual_signature : [assign_value] - -- op : atan - inputs : - x : X - outputs : - out : Out - -- op : atan2 - backward : atan2_grad - inputs : - {x : X1, y : X2} - outputs : - out : Out - -- op : atanh - backward : atanh_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : auc - inputs : - {x : Predict, label : Label, stat_pos : StatPos, stat_neg : StatNeg, ins_tag_weight : InsTagWeight} - outputs : - {auc : AUC, stat_pos_out : StatPosOut, stat_neg_out : StatNegOut} - -- op : batch_norm - backward : batch_norm_grad, batch_norm_double_grad(batch_norm_grad_grad) - inputs: - x : X - mean : Mean - variance : Variance - scale : Scale - bias : 
Bias - outputs : - out : Y - mean_out: MeanOut - variance_out: VarianceOut - saved_mean: SavedMean - saved_variance: SavedVariance - reserve_space: ReserveSpace - attrs: - data_format: data_layout - extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] - -- op : bce_loss - backward : bce_loss_grad - inputs : - {input : X, label : Label} - outputs : - out : Out - -- op : bernoulli - inputs : - x : X - outputs : - out : Out - -- op : bicubic_interp (bicubic_interp_v2) - backward : bicubic_interp_grad (bicubic_interp_v2_grad) - inputs : - {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} - outputs : - output : Out - attrs: - data_format: data_layout - extra : - attrs : [bool use_mkldnn = false] - -- op : bilinear (bilinear_tensor_product) - backward: bilinear_grad (bilinear_tensor_product_grad) - inputs : - {x : X, y : Y,weight: Weight, bias: Bias} - outputs : - {out : Out} - -- op : bilinear_interp (bilinear_interp_v2) - backward : bilinear_interp_grad (bilinear_interp_v2_grad) - inputs : - {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} - outputs : - output : Out - attrs: - data_format: data_layout - extra : - attrs : [bool use_mkldnn = false] - -- op : bincount - inputs : - {x : X, weights : Weights} - outputs : - out : Out - scalar: - minlength: - data_type : int - support_tensor : true - get_expected_kernel_type : - bincount : GetBincountExpectedKernelType - -- op : bitwise_and - inputs : - {x : X, y : Y} - outputs : - {out : Out} - -- op : bitwise_not - inputs : - {x : X} - outputs : - {out : Out} - -- op : bitwise_or - inputs : - {x : X, y : Y} - outputs : - {out : Out} - -- op : bitwise_xor - inputs : - {x : X, y : Y} - outputs : - {out : Out} - -- op : bmm - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : bn_act_xpu - attrs: - data_format: data_layout - -- op : box_coder - inputs : - {prior_box : PriorBox , prior_box_var : PriorBoxVar, target_box: TargetBox} - outputs : - output_box : OutputBox - -- op : broadcast_tensors - backward : broadcast_tensors_grad - inputs : - input : X - outputs : - out : Out - drop_empty_grad : [input_grad] - -- op : c_concat - inputs : - x : X - outputs : - out : Out - -- op : c_embedding - backward : c_embedding_grad - inputs : - {weight : W, x : Ids} - outputs : - out : Out - -- op : c_softmax_with_cross_entropy - backward : c_softmax_with_cross_entropy_grad - inputs : - {logits : Logits, label : Label} - outputs : - {softmax : Softmax, loss : Loss} - -- op : cast - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false] - -- op : ceil - backward : ceil_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : celu - backward : celu_grad, celu_double_grad(celu_grad_grad) - inputs : - x : X - outputs : - out : Out - -- op : check_finite_and_unscale_(check_finite_and_unscale) - inputs : - {x : X, scale: Scale} - outputs : - {out : Out, found_infinite: FoundInfinite} - get_expected_kernel_type : - check_finite_and_unscale_ : GetCheckFiniteAndUnscaleExpectedKernelType - -- op : cholesky - inputs : - x : X - outputs : - out : Out - -- op : cholesky_solve - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : class_center_sample - inputs : - label : Label - outputs : - {remapped_label : RemappedLabel, sampled_local_class_center : SampledLocalClassCenter} - -- op : clip - backward : clip_grad, clip_double_grad - inputs : - x : X - outputs : - out : Out - scalar : - min 
: - data_type : float - tensor_name : Min - max : - data_type : float - tensor_name : Max - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - -- op : clip_by_norm - inputs : - x : X - outputs : - out : Out - -- op : coalesce_tensor - inputs : - {input : Input} - outputs : - {output : Output, fused_output : FusedOutput} - attrs : - {size_of_dtype : user_defined_size_of_dtype} - -- op : complex - backward : complex_grad - inputs : - {real : X, imag : Y} - outputs : - out : Out - -- op : concat - backward : concat_grad, concat_double_grad - inputs: - x: X - outputs: - out: Out - attrs: - axis: axis - scalar : - axis : - data_type : int - tensor_name : AxisTensor - drop_empty_grad : [x_grad] - extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] - get_expected_kernel_type : - concat : GetConcatExpectedKernelType - -- op : conditional_block - backward : conditional_block_grad - extra : - attrs : ['str[] skip_eager_deletion_vars = {}'] - -- op : conj - inputs : - x : X - outputs : - out : Out - -- op : conv2d - backward : conv2d_grad, conv2d_grad_grad - inputs : - {input : Input, filter : Filter} - outputs : - out : Output - extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_addto = false, - bool force_fp32_output = false, - int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] - get_expected_kernel_type : - conv2d : GetConvExpectedKernelType - -- op : conv2d_transpose - backward : conv2d_transpose_grad, conv2d_transpose_double_grad (conv2d_transpose_grad_grad) - inputs : - {x : Input, filter : Filter, bias : Bias} - outputs : - out : Output - int_array : - output_size : - data_type : int - support_tensor : true - extra : - inputs : [bias] - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool force_fp32_output = false, - str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, - int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] - -- op : conv3d - backward : conv3d_grad, conv3d_double_grad (conv3d_grad_grad) - inputs : - {input : Input, filter : Filter} - outputs : - out : Output - extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, - bool use_addto = false, bool fuse_residual_connection = false, bool force_fp32_output = false, - int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] - get_expected_kernel_type : - conv3d : GetConvExpectedKernelType - -- op : conv3d_transpose - backward : conv3d_transpose_grad - inputs : - {x : Input, filter : Filter} - outputs : - out : Output - extra : - attrs : [bool use_cudnn = true, bool use_mkldnn = false, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] - -- op : cos - backward : cos_grad, cos_double_grad, cos_triple_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : cosh - backward : cosh_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : crop (crop_tensor) - backward : crop_grad (crop_tensor_grad) - inputs : - x : X - 
outputs : - out : Out - int_array: - shape : - data_type : int - tensor_name : Shape - tensors_name : ShapeTensor - offsets : - data_type : int - tensor_name : Offsets - tensors_name : OffsetsTensor - -- op : cross - inputs : - {x : X, y : Y} - attrs : - axis : dim - outputs : - out : Out - -- op : cross_entropy_with_softmax (softmax_with_cross_entropy) - backward : cross_entropy_with_softmax_grad (softmax_with_cross_entropy_grad) - inputs : - {input : Logits, label : Label} - outputs : - {softmax : Softmax, loss : Loss} - -- op : cumprod - backward : cumprod_grad - inputs : - x : X - attrs : - dim : dim - outputs : - out : Out - -- op : cumsum - backward: cumsum_grad - inputs : - x : X - outputs : - out : Out - scalar: - axis: - data_type : int - support_tensor : true - -- op : data_norm - backward : data_norm_grad - extra : - attrs : [bool use_mkldnn = false] - -- op : decode_jpeg - inputs : - x : X - outputs : - out : Out - -- op : deformable_conv - backward : deformable_conv_grad - inputs : - {x : Input, offset : Offset, filter : Filter, mask : Mask} - outputs : - out : Output - -- op : depthwise_conv2d - backward : depthwise_conv2d_grad, depthwise_conv2d_double_grad (depthwise_conv2d_grad_grad) - inputs : - {input : Input, filter : Filter} - outputs : - out : Output - attrs : - {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} - extra : - attrs : [bool is_test = false, bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, - bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, - bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, - float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, - int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] - get_expected_kernel_type : - depthwise_conv2d : GetConvExpectedKernelType - -- op : depthwise_conv2d_transpose - backward : depthwise_conv2d_transpose_grad - inputs : - {x : Input, filter : Filter, bias: Bias} - outputs : - out : Output - int_array : - output_size : - data_type : int - support_tensor : true - extra : - inputs : [bias] - attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = false, bool force_fp32_output = false, - str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, - int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] - -- op : dequantize - inputs : - input : Input - outputs : - output : Output - attrs : - {scale : Scale, shift : Shift} - -- op : dequantize_linear - extra : - attrs : [float moving_rate = 0.9] - -- op : det (determinant) - backward : det_grad (determinant_grad) - inputs : - x : Input - outputs : - out : Out - -- op : diag (diag_v2) - backward : diag_grad (diag_v2_grad) - inputs : - x : X - outputs : - out : Out - -- op : diag_embed - inputs : - input : Input - outputs : - out : Out - -- op : diagonal - inputs : - x : Input - outputs : - out : Out - -- op : digamma - inputs : - x : X - outputs : - out : Out - -- op : dirichlet - inputs : - alpha : Alpha - outputs : - out : Out - -- op : dist - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : distributed_push_sparse - extra : - attrs : ['int[] slots = {}'] - -- 
op : divide (elementwise_div) - backward : divide_grad (elementwise_div_grad) - inputs : - {x: X, y : Y} - outputs : - out: Out - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - -- op : dot - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : dropout - backward : dropout_grad - inputs : - x : X - seed_tensor : Seed - outputs : - out : Out - mask : Mask - attrs : - p : dropout_prob - is_test : is_test - mode : dropout_implementation - seed : seed - fix_seed : fix_seed - extra : - attrs : [bool fix_seed = false, int seed = 0] - -- op : dropout_nd - backward : dropout_nd_grad - extra : - attrs : [bool fix_seed = false, int seed = 0] - -- op : edit_distance - inputs : - hyps : Hyps - refs : Refs - hypslength : HypsLength - refslength : RefsLength - outputs : - sequencenum : SequenceNum - out : Out - -- op : eig - inputs : - x : X - outputs : - out_w : Eigenvalues - out_v : Eigenvectors - -- op : eigh - inputs : - x : X - outputs : - out_w : Eigenvalues - out_v : Eigenvectors - -- op : eigvals - inputs : - x : X - outputs : - out : Out - -- op : eigvalsh - backward : eigvalsh_grad - inputs : - {x : X} - outputs : - {eigenvalues : Eigenvalues, eigenvectors : Eigenvectors} - attrs : - uplo : UPLO - -- op : einsum - backward : einsum_grad - inputs : - x : Operands - outputs: - {out : Out, inner_cache: InnerCache, xshape : XShape} - drop_empty_grad: [x_grad] - extra: - outputs: [inner_cache, xshape] - -- op : elementwise_pow - backward : elementwise_pow_grad - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - manual_signature : [elementwise_pow] - -- op : elu - backward : elu_grad, elu_double_grad (elu_grad_grad) - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false] - -- op : embedding (lookup_table_v2) - backward : embedding_grad (lookup_table_v2_grad) - inputs : - {x : Ids, weight : W} - outputs : - out : Out - attrs : - sparse : is_sparse - manual_signature : [embedding_grad] - extra : - attrs : [bool is_sparse = false, bool is_distributed = false, bool remote_prefetch = false, - int trainer_id = 0, int slot = 0, 'int64_t[] height_sections = {}', 'str[] epmap = {}', - 'str[] table_names = {}'] - -- op : empty - outputs : - out : Out - int_array: - shape : - data_type : int64_t - tensor_name : ShapeTensor - tensors_name : ShapeTensorList - -- op : equal - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : equal_all - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : erf - inputs : - x : X - outputs : - out : Out - -- op : erfinv - inputs : - x : X - outputs : - out : Out - -- op : exp - backward : exp_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : expand (expand_v2) - backward : expand_grad (expand_v2_grad), expand_double_grad(expand_v2_double_grad) - inputs : - x : X - attrs : - shape : shape - outputs : - out : Out - int_array: - shape : - data_type : int - tensor_name : Shape - tensors_name : expand_shapes_tensor - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - manual_signature : [expand, expand_grad] - -- op : expand_as (expand_as_v2) - backward : expand_as_grad (expand_as_v2_grad) - 
inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : expm1 - backward : expm1_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : exponential_ (exponential) - backward : exponential__grad (exponential_grad) - inputs : - x : X - outputs : - out : Out - attrs : - lam : lambda - -- op : eye - outputs : - out : Out - scalar : - num_rows : - support_tensor : true - num_columns : - support_tensor : true - -- op : fake_channel_wise_quantize_abs_max - extra : - attrs : [int round_type = 1] - -- op : fake_channel_wise_quantize_dequantize_abs_max - extra : - attrs : [int round_type = 1] - -- op : fake_quantize_abs_max - extra : - attrs : [int round_type = 1] - -- op : fake_quantize_dequantize_abs_max - extra : - attrs : [int round_type = 1] - -- op : fake_quantize_dequantize_moving_average_abs_max - extra : - attrs : [int round_type = 1] - -- op : fake_quantize_moving_average_abs_max - extra : - attrs : [int round_type = 1] - -- op : fake_quantize_range_abs_max - extra : - attrs : [int round_type = 1] - -- op : fc - inputs : - input : Input - w : W - bias : Bias - outputs : - out : Out - extra : - attrs : [bool ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE = true, bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", float Scale_in = 1.0f, "float[] Scale_weights = {1.0f}", float Scale_out = 1.0f, bool force_fp32_output = false] - -- op : feed - outputs: {out: Out} - -- op : fft_c2c - inputs: {x: X} - outputs: {out: Out} - -- op : fft_c2r - inputs: {x: X} - outputs: {out: Out} - -- op : fft_r2c - inputs: {x: X} - outputs: {out: Out} - -- op : fill (fill_any) - backward : fill_grad (fill_any_grad) - inputs : - x : X - outputs : - out : Out - scalar : - value : - data_type : float - support_tensor : true - -- op : fill_diagonal - backward : fill_diagonal_grad - inputs : - x : X - outputs : - out : Out - -- op : fill_diagonal_tensor - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : flatten (flatten_contiguous_range) - backward : flatten_grad (flatten_contiguous_range_grad) - inputs : - x : X - outputs : - {out : Out, xshape : XShape} - attrs : - {start_axis : start_axis, stop_axis : stop_axis} - extra : - outputs : [xshape] - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - manual_signature : [flatten, flatten_grad] - -- op : flip - inputs : - x : X - outputs : - out : Out - -- op : floor - backward : floor_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : floor_divide (elementwise_floordiv) - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - manual_signature : [floor_divide] - -- op : fmax (elementwise_fmax) - backward : fmax_grad (elementwise_fmax_grad) - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - manual_signature : [fmax] - -- op : fmin (elementwise_fmin) - backward : fmin_grad (elementwise_fmin_grad) - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float 
Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - manual_signature : [fmin] - -- op : fold - inputs : - x : X - outputs : - out : Y - -- op : frame - backward : frame_grad - inputs : - x : X - outputs : - out : Out - -- op : frobenius_norm - backward : frobenius_norm_grad - inputs: - x : X - attrs: - { axis : dim, keepdim : keep_dim} - outputs: - out : Out - int_array: - axis : - data_type : int - support_tensor : true - get_expected_kernel_type : - frobenius_norm : GetReduceExpectedKernelType - frobenius_norm_grad : GetReduceGradExpectedKernelType - extra : - attrs : [bool use_mkldnn = false] - -- op : full (fill_constant) - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false] - -- op : full_like (fill_any_like) - inputs : - x : X - outputs : - out : Out - scalar : - value : - data_type : float - support_tensor : true - -- op : fused_attention - backward: fused_attention_grad - inputs: - x: X - ln_scale: LnScale - ln_bias: LnBias - qkv_weight: QKVW - qkv_bias: QKVBias - cache_kv: CacheKV - src_mask: SrcMask - out_linear_weight: OutLinearW - out_linear_bias: OutLinearBias - ln_scale_2: Ln2Scale - ln_bias_2: Ln2Bias - outputs: - ln_mean: LnMean - ln_var: LnVariance - ln_out: LnOut - qkv_out: QKVOut - qkv_bias_out: QKVBiasOut - transpose_out_2: TransposeOut2 - qk_out: QKOut - qktv_out: QKTVOut - softmax_out: SoftmaxOut - attn_dropout_mask_out: AttnDropoutMaskOut - attn_dropout_out: AttnDropoutOut - src_mask_out: SrcMaskOut - fmha_out: FMHAOut - out_linear_out: OutLinearOut - dropout_mask_out: DropoutMaskOut - ln_mean_2: Ln2Mean - ln_var_2: Ln2Variance - bias_dropout_residual_out: BiasDropoutResidualOut - cache_kv_out: CacheKVOut - out: Y - -- op : fused_batch_norm_act - backward : fused_batch_norm_act_grad - inputs: - x : X - mean : Mean - variance : Variance - scale : Scale - bias : Bias - outputs : - out : Y - mean_out: MeanOut - variance_out: VarianceOut - saved_mean: SavedMean - saved_variance: SavedVariance - reserve_space: ReserveSpace - -- op : fused_bias_dropout_residual_layer_norm - backward : fused_bias_dropout_residual_layer_norm_grad - inputs : - x : X - residual : Residual - bias : Bias - ln_scale : LnScale - ln_bias : LnBias - outputs : - bias_dropout_residual_out : BiasDropoutResidualOut - dropout_mask_out : DropoutMaskOut - ln_mean : LnMean - ln_variance : LnVariance - y : Y - -- op : fused_bn_add_activation_ (fused_bn_add_activation) - backward : fused_bn_add_activation_grad - inputs: - x : X - z : Z - mean : Mean - variance : Variance - scale : Scale - bias : Bias - outputs : - out : Y - mean_out: MeanOut - variance_out: VarianceOut - saved_mean: SavedMean - saved_variance: SavedVariance - reserve_space: ReserveSpace - -- op : fused_conv2d - inputs : - {input : Input, filter : Filter, bias : Bias, residual_param : ResidualData} - outputs : - {output : Output} - attrs : - {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} - extra : - attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, - float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] - -- op : fused_conv2d_add_act - inputs : - input : Input - filter : Filter - bias : Bias - residual_data : ResidualData - outputs : - output : Output - outputs : Outputs - extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool fuse_relu_before_depthwise_conv = 
false, bool use_mkldnn = false, - bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_beta = 0.0f, bool use_addto = false, - bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, - float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false] - get_expected_kernel_type : - fused_conv2d_add_act : GetConvExpectedKernelType - -- op : fused_conv3d - inputs : - {input : Input, filter : Filter, bias : Bias, residual_param : ResidualData} - outputs : - {output : Output} - attrs : - {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} - extra : - attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, - float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] - -- op : fused_embedding_eltwise_layernorm - inputs : - ids : Ids - embs : Embs - bias : Bias - scale : Scale - outputs : - out : Out - -- op : fused_fc_elementwise_layernorm - inputs : - x : X - w : W - y : Y - bias0 : Bias0 - scale : Scale - bias1 : Bias1 - outputs : - out : Out - mean : Mean - variance : Variance - -- op : fused_feedforward - backward: fused_feedforward_grad - inputs: - x: X - dropout1_seed: Dropout1Seed - dropout2_seed: Dropout2Seed - linear1_weight: Linear1Weight - linear1_bias: Linear1Bias - linear2_weight: Linear2Weight - linear2_bias: Linear2Bias - ln1_scale: Ln1Scale - ln1_bias: Ln1Bias - ln2_scale: Ln2Scale - ln2_bias: Ln2Bias - attrs: - dropout1_seed_val: dropout1_seed - dropout2_seed_val: dropout2_seed - dropout1_prob: dropout1_rate - dropout2_prob: dropout2_rate - outputs: - out: Out - dropout1_mask: Dropout1Mask - dropout2_mask: Dropout2Mask - ln1_mean: Ln1Mean - ln1_variance: Ln1Variance - ln2_mean: Ln2Mean - ln2_variance: Ln2Variance - linear1_out: Linear1Out - ln1_out: Ln1Out - dropout1_out: Dropout1Out - dropout2_out: Dropout2Out - -- op : fused_gemm_epilogue - inputs: - {x : X, y : Y, bias : Bias} - outputs : - {out : Out, reserve_space: ReserveSpace} - -- op : fused_gemm_epilogue_grad - inputs: - {x : X, y : Y, reserve_space: ReserveSpace, out_grad : DOut} - outputs : - {x_grad : DX, y_grad : DY, bias_grad : DBias} - -- op : fused_transpose - extra : - attrs : [str data_format = "AnyLayout"] - -- op : fusion_gru - inputs : - x : X - h0 : H0 - weight_x : WeightX - weight_h : WeightH - bias : Bias - outputs : - reordered_h0 : ReorderedH0 - xx : XX - batched_input : BatchedInput - batched_out : BatchedOut - hidden : Hidden - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", float Scale_data = 1.0f, float Shift_data = 0.0f, 'float[] Scale_weights = {1.0f}'] - -- op : fusion_lstm - extra : - attrs : [bool use_mkldnn = true, str mkldnn_data_type = "float32"] - -- op : fusion_repeated_fc_relu - inputs : - x : X - w : W - bias : Bias - outputs : - relu_out : ReluOut - out : Out - -- op : fusion_seqconv_eltadd_relu - inputs : - x : X - filter : Filter - bias : Bias - outputs : - out : Out - col_mat : ColMat - attrs : - context_length : contextLength - context_start : contextStart - context_stride : contextStride - -- op : fusion_seqexpand_concat_fc - inputs : - x : X - fc_weight : FCWeight - fc_bias : FCBias - outputs : - out : Out - fc_out : FCOut - -- op : fusion_transpose_flatten_concat - inputs : - x : X - outputs : - out : Out - -- op : gather - 
backward : gather_grad - inputs : - {x : X, index : Index} - outputs : - out : Out - scalar : - axis : - data_type : int - tensor_name : Axis - -- op : gather_nd - backward : gather_nd_grad - inputs : - {x : X, index : Index} - outputs : - out : Out - -- op : gather_tree - inputs : - {ids : Ids, parents : Parents} - outputs : - out : Out - -- op : gaussian (gaussian_random) - outputs : - out : Out - int_array: - shape : - data_type : int64_t - tensor_name : ShapeTensor - tensors_name : ShapeTensorList - extra : - attrs : [bool use_mkldnn = false] - manual_signature : [gaussian] - -- op : gelu - backward : gelu_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - -- op : generate_proposals(generate_proposals_v2) - inputs : - {scores : Scores, bbox_deltas : BboxDeltas, im_shape : ImShape, anchors : Anchors, variances : Variances} - outputs : - {rpn_rois : RpnRois, rpn_roi_probs : RpnRoiProbs, rpn_rois_num : RpnRoisNum} - attrs : - {pre_nms_top_n : pre_nms_topN, post_nms_top_n : post_nms_topN} - -- op : grad_add - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - -- op : graph_khop_sampler - inputs : - {row : Row, colptr : Col_Ptr, x : X, eids : Eids} - outputs : - {out_src : Out_Src, out_dst : Out_Dst, sample_index : Sample_Index, reindex_x : Reindex_X, out_eids : Out_Eids} - -- op : graph_sample_neighbors - inputs : - {row : Row, colptr : Col_Ptr, x : X, eids : Eids, perm_buffer : Perm_Buffer} - outputs : - {out : Out, out_count : Out_Count, out_eids : Out_Eids} - -- op : greater_equal - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : greater_than - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : grid_sample(grid_sampler) - backward : grid_sample_grad (grid_sampler_grad) - inputs : - {x : X, grid : Grid} - outputs : - out : Output - extra : - attrs : [bool use_cudnn = true] - -- op : group_norm - inputs : - x : X - scale : Scale - bias : Bias - outputs : - y : Y - mean : Mean - variance : Variance - attrs: - data_format: data_layout - -- op : gru - backward : gru_grad - extra : - attrs : [bool is_test = false] - -- op : gumbel_softmax - inputs : - x : X - outputs : - out : Out - -- op : hardshrink (hard_shrink) - backward : hardshrink_grad (hard_shrink_grad) - inputs : - x : X - outputs : - out : Out - -- op : hardsigmoid (hard_sigmoid) - backward : hardsigmoid_grad (hard_sigmoid_grad) - inputs : - x : X - outputs : - out : Out - -- op : hardswish (hard_swish) - inputs : - x : X - outputs : - out : Out - backward : hardswish_grad (hard_swish_grad) - extra : - attrs : [bool use_mkldnn = false] - manual_signature : [hardswish] - -- op : hardtanh (brelu) - backward : hardtanh_grad (brelu_grad) - inputs : - x : X - outputs : - out : Out - -- op : heaviside (elementwise_heaviside) - backward : heaviside_grad (elementwise_heaviside_grad) - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - -- op : histogram - inputs : - input : X - outputs : - out : Out - -- op : hsigmoid_loss(hierarchical_sigmoid) - backward: hsigmoid_loss_grad(hierarchical_sigmoid_grad) - inputs: - {x: X, w: W, label: Label, bias: Bias, path: 
PathTable, code: PathCode} - outputs: - {out: Out, pre_out: PreOut, w_out: W_Out} - -- op : huber_loss - backward : huber_loss_grad - inputs : - {input : X, label : Y} - outputs : - {out : Out, residual : Residual} - -- op : imag - backward : imag_grad - inputs : - x : X - outputs : - out : Out - -- op : increment - inputs : - x : X - outputs : - out : Out - -- op : index_add - inputs : - {x : X, index : Index, add_value : AddValue} - outputs : - out : Out - -- op : index_sample - inputs : - {x : X, index : Index} - outputs : - out : Out - -- op : index_select - inputs : - {x : X, index : Index} - outputs : - out : Out - attrs : - axis : dim - -- op : instance_norm - inputs : - x : X - scale : Scale - bias : Bias - outputs : - y : Y - saved_mean : SavedMean - saved_variance : SavedVariance - extra: - outputs: [ saved_mean, saved_variance ] - get_expected_kernel_type: - instance_norm: GetInstanceNormExpectedKernelType - -- op : inverse - inputs : - x : Input - outputs : - out : Output - -- op : is_empty - inputs : - x : X - outputs : - out : Out - -- op : isclose - inputs : - {x : Input, y : Other} - outputs : - out : Out - scalar : - rtol : - data_type : std::string - tensor_name : Rtol - atol : - data_type : std::string - tensor_name : Atol - -- op : isfinite (isfinite_v2) - inputs : - x : X - outputs : - out : Out - -- op : isinf (isinf_v2) - inputs : - x : X - outputs : - out : Out - -- op : isnan (isnan_v2) - inputs : - x : X - outputs : - out : Out - -- op : kldiv_loss - backward : kldiv_loss_grad - inputs : - {x : X, label : Target} - outputs : - out : Loss - -- op : kron - backward : kron_grad - inputs : - {x : X, y : Y} - outputs : - {out : Out} - complex_promote : [X, Y] - -- op : kthvalue - inputs : - x : X - outputs : - {out : Out, indices : Indices} - -- op : label_smooth - inputs : - {label : X, prior_dist : PriorDist} - outputs : - out : Out - -- op : lamb_ (lamb) - inputs : - {param : Param, grad : Grad, learning_rate : LearningRate, moment1 : Moment1, moment2 : Moment2, beta1_pow : Beta1Pow, beta2_pow : Beta2Pow, master_param : MasterParam, skip_update : SkipUpdate} - outputs : - {param_out : ParamOut, moment1_out : Moment1Out, moment2_out : Moment2Out, beta1_pow_out : Beta1PowOut, beta2_pow_out : Beta2PowOut, master_param_outs : MasterParamOut} - -- op : layer_norm - backward : layer_norm_grad - inputs : - x : X - scale : Scale - bias : Bias - outputs : - out : Y - mean : Mean - variance : Variance - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] - get_expected_kernel_type : - layer_norm : GetLayerNormExpectedKernelType - -- op : leaky_relu - backward : leaky_relu_grad, leaky_relu_double_grad (leaky_relu_grad_grad) - inputs : - x : X - outputs : - out : Out - attrs: - negative_slope : alpha - extra : - attrs : [bool use_mkldnn = false] - -- op : lerp - backward : lerp_grad - inputs : - {x : X, y : Y, weight : Weight} - outputs : - out : Out - -- op : less_equal - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : less_than - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : lgamma - inputs : - x : X - outputs : - out : Out - -- op : linear_interp (linear_interp_v2) - backward : linear_interp_grad (linear_interp_v2_grad) - inputs : - {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} - outputs : - output : Out - attrs: - data_format: data_layout - extra : - attrs : [bool use_mkldnn = false] - -- op : linspace - inputs : - {start : Start, stop : Stop, number : Num} - outputs : - 
out : Out - -- op : log - backward : log_grad, log_double_grad (log_grad_grad) - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : log10 - backward : log10_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : log1p - backward : log1p_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : log2 - backward : log2_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : log_loss - backward : log_loss_grad - inputs : - {input : Predicted, label : Labels} - outputs : - out : Loss - -- op : log_softmax - backward : log_softmax_grad - inputs : - x : X - outputs : - out: Out - extra : - attrs : [bool use_mkldnn = false] - -- op : logcumsumexp - backward : logcumsumexp_grad - inputs : - x : X - outputs : - out : Out - -- op : logical_and - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : logical_not - inputs : - x : X - outputs : - out : Out - -- op : logical_or - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : logical_xor - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : logit - inputs : - x : X - outputs : - out : Out - -- op : logsigmoid - backward : logsigmoid_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : logsumexp - backward : logsumexp_grad - inputs : - x : X - outputs : - out : Out - -- op : lrn - backward : lrn_grad - inputs : - x : X - outputs : - {out : Out, mid_out : MidOut} - extra : - attrs : [bool use_mkldnn = false, bool is_test = false] - -- op : lstsq - inputs : - {x : X, y : Y} - outputs : - {solution : Solution, residuals : Residuals, rank : Rank, singular_values : SingularValues} - scalar : - rcond : - data_type : float - support_tensor : true - -- op : lu_unpack - backward : lu_unpack_grad - inputs : - {x : X, y : Pivots} - outputs : - {pmat : Pmat, l : L, u : U} - -- op : margin_cross_entropy - backward : margin_cross_entropy_grad - inputs: - {logits : Logits, label : Label} - outputs: - {softmax : Softmax, loss : Loss} - -- op : masked_select - inputs : - {x : X, mask : Mask} - outputs : - out : Y - -- op : matmul (matmul_v2) - backward : matmul_grad (matmul_v2_grad), matmul_double_grad (matmul_v2_grad_grad), matmul_triple_grad (matmul_v2_triple_grad) - inputs : - {x : X, y : Y} - attrs : - {transpose_x : trans_x, transpose_y : trans_y} - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - complex_promote : [X, Y] - -- op : matmul_with_flatten (mul) - backward : matmul_with_flatten_grad (mul_grad) - inputs : - {x : X, y : Y} - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, float scale_x = 1.0f, 'float[] scale_y = {1.0f}', - float scale_out = 1.0f, bool force_fp32_output = false] - -- op : matrix_nms - inputs : - {bboxes : BBoxes, scores : Scores} - outputs : - {out : Out, index : Index, roisnum : RoisNum} - get_expected_kernel_type : - matrix_nms : GetMatrixNmsExpectedKernelType - -- op : matrix_power - inputs : - x : X - outputs : - out : Out - -- op : matrix_rank - inputs : - {x : X, tol_tensor : TolTensor} - outputs : - out : Out - manual_signature : [matrix_rank] - -- op : max (reduce_max) - backward : max_grad (reduce_max_grad) - inputs: - x : X - attrs: - { axis : dim, keepdim : keep_dim} - outputs: 
- out : Out - extra : - attrs : [bool use_mkldnn = false] - int_array: - axis : - data_type : int - support_tensor : true - get_expected_kernel_type : - max : GetReduceExpectedKernelType - max_grad : GetReduceGradExpectedKernelType - manual_signature : [max] - -- op : max_pool2d_with_index - inputs : - {x : X} - outputs : - {out : Out, mask : Mask} - attrs : - kernel_size : ksize - -- op : max_pool3d_with_index - inputs : - {x : X} - outputs : - {out : Out, mask : Mask} - attrs : - kernel_size : ksize - -- op : maximum (elementwise_max) - backward : maximum_grad (elementwise_max_grad) - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - manual_signature : [maximum] - -- op : maxout - inputs : - x : X - outputs : - out : Out - -- op : mean (reduce_mean) - backward : mean_grad (reduce_mean_grad) - inputs : - x : X - outputs : - out : Out - attrs : - {axis : dim, keepdim : keep_dim} - extra : - attrs : [bool use_mkldnn = false] - -- op : mean_all (mean) - backward : mean_all_grad (mean_grad) - inputs : - x : X - outputs : - out : Out - -- op : merge_selected_rows - inputs : - x : X - outputs : - out : Out - -- op : merged_adam_ - inputs : - {param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam} - outputs : - {param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut} - scalar : - beta1 : - data_type : float - support_tensor : true - beta2 : - data_type : float - support_tensor : true - epsilon : - data_type : float - support_tensor : true - -- op : merged_momentum_ (merged_momentum) - inputs : - {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} - outputs : - {param_out : ParamOut, velocity_out : VelocityOut, master_param_out : MasterParamOut} - -- op : meshgrid - backward : meshgrid_grad - inputs : - inputs : X - outputs : - out : Out - drop_empty_grad : [inputs_grad] - -- op : min (reduce_min) - backward : min_grad (reduce_min_grad) - inputs: - x : X - outputs: - out : Out - attrs: - { axis : dim, keepdim : keep_dim} - extra : - attrs : [bool use_mkldnn = false] - int_array: - axis : - data_type : int - support_tensor : true - get_expected_kernel_type : - min : GetReduceExpectedKernelType - min_grad : GetReduceGradExpectedKernelType - manual_signature : [min] - -- op : minimum (elementwise_min) - backward : minimum_grad (elementwise_min_grad) - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - manual_signature : [minimum] - -- op : mish - backward : mish_grad - inputs: - {x : X, lambda : threshold} - outputs: - out: Out - extra : - attrs : [bool use_mkldnn = false] - -- op : mode - backward : mode_grad - inputs : - x : X - outputs : - {out : Out, indices : Indices} - -- op : momentum_ (momentum) - inputs : - {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} - outputs : - {param_out : ParamOut, velocity_out : 
VelocityOut, master_param_out : MasterParamOut} - -- op : multi_dot - backward : multi_dot_grad - inputs : - x : X - outputs : - out : Out - drop_empty_grad : [x_grad] - -- op : multiclass_nms3 - inputs : - {bboxes : BBoxes, scores : Scores, rois_num : RoisNum} - outputs : - {out : Out, index : Index, nms_rois_num : NmsRoisNum} - -- op : multihead_matmul - inputs : - {input : Input, w : W, bias : Bias, bias_qk : BiasQK} - outputs : - out : Out - attrs : - {transpose_q : transpose_Q, transpose_k : transpose_K, transpose_v : transpose_V} - -- op : multinomial - inputs : - {x : X} - outputs : - out : Out - scalar : - num_samples : - data_type : int - support_tensor : true - -- op : multiplex - backward : multiplex_grad - inputs : - {inputs : X, index : Ids} - outputs : - out : Out - drop_empty_grad : [inputs_grad] - -- op : multiply (elementwise_mul) - backward : multiply_grad (elementwise_mul_grad) - inputs : - {x : X, y : Y} - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - -- op : mv - inputs : - {x : X, vec : Vec} - outputs : - out : Out - -- op : nanmedian - backward : nanmedian_grad - inputs : - {x : X} - outputs : - {out : Out, medians : MedianIndex} - int_array: - axis: - data_type : int - extra: - outputs : [medians] - -- op : nce - backward : nce_grad - extra : - attrs : [int trainer_id = 0, 'int64_t[] height_sections = {}', 'str[] epmap = {}', - 'str[] table_names = {}', 'int[] custom_neg_classes = {}'] - -- op : nearest_interp (nearest_interp_v2) - backward : nearest_interp_grad (nearest_interp_v2_grad) - inputs : - {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} - outputs : - output : Out - attrs: - data_format: data_layout - extra : - attrs : [bool use_mkldnn = false] - -- op : nll_loss - backward : nll_loss_grad - inputs : - {input : X, label : Label, weight : Weight} - outputs : - {out : Out, total_weight : Total_weight} - -- op : nms - inputs : - x : Boxes - outputs : - out : KeepBoxesIdxs - attrs : - threshold : iou_threshold - -- op : nonzero (where_index) - inputs : - condition : Condition - outputs : - out : Out - -- op : norm - backward : norm_grad - inputs : - x : X - outputs : - {out : Out, norm : Norm} - extra : - outputs : [norm] - -- op : not_equal - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : numel(size) - inputs : - x : Input - outputs : - size : Out - -- op : one_hot (one_hot_v2) - inputs : - x : X - outputs : - out : Out - scalar : - depth : - data_type : int - tensor_name : depth_tensor - -- op : overlap_add - backward : overlap_add_grad - inputs : - x : X - outputs : - out : Out - -- op : p_norm - backward: p_norm_grad - inputs : - x : X - outputs : - out : Out - -- op : pad - backward : pad_grad, pad_double_grad - inputs : - x : X - outputs : - out : Out - scalar: - pad_value: - data_type : float - support_tensor : true - -- op : pad2d - backward : pad2d_grad - extra : - attrs : [bool use_mkldnn = false] - -- op : pad3d - backward : pad3d_grad, pad3d_double_grad - inputs : - x : X - outputs : - out : Out - int_array: - paddings : - data_type : int - tensor_name : Paddings - attrs : - pad_value : value - extra : - attrs : [bool use_mkldnn = false] - -- op : partial_sum - backward : partial_sum_grad - extra : - attrs : [bool use_mkldnn = false] - -- op : pixel_shuffle - backward : pixel_shuffle_grad - inputs : - x : X - outputs : - out : Out - -- op : pixel_unshuffle 
- backward : pixel_unshuffle_grad - inputs : - x : X - outputs : - out : Out - -- op : poisson - inputs : - x : X - outputs : - out : Out - -- op : pool2d - backward : pool2d_grad, pool2d_double_grad - inputs : - {x : X} - outputs : - {out : Out} - attrs : - {kernel_size : ksize} - int_array: - kernel_size : - data_type : int - support_tensor : true - get_expected_kernel_type : - pool2d : GetPoolExpectedKernelType - pool2d_grad : GetPoolExpectedKernelType - pool2d_double_grad : GetPoolDoubleGradExpectedKernelType - extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, - str mkldnn_data_type = "float32", bool is_test = false] - -- op : pool3d - backward : pool3d_grad - inputs : - {x : X} - outputs : - {out : Out} - attrs : - {kernel_size : ksize} - get_expected_kernel_type : - pool3d : GetPoolExpectedKernelType - pool3d_grad : GetPoolExpectedKernelType - extra : - attrs : [bool use_mkldnn = false] - -- op : pow - backward : pow_grad, pow_double_grad, pow_triple_grad - inputs : - x : X - outputs : - out : Out - attrs : - y : factor - scalar : - y : - data_type : float - tensor_name : FactorTensor - -- op : prelu - backward : prelu_grad - inputs : - { x : X, alpha : Alpha} - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] - -- op : print - inputs : - in : In - outputs : - out : Out - -- op : prior_box - inputs : - {input: Input, image: Image} - outputs : - {out: Boxes, var: Variances} - extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] - -- op : prod (reduce_prod) - backward : prod_grad (reduce_prod_grad) - inputs: - x : X - outputs: - out : Out - attrs: - { dims : dim, keep_dim : keep_dim} - int_array: - dims : - data_type : int - support_tensor : true - extra : - attrs : [bool use_mkldnn = false] - get_expected_kernel_type : - prod : GetReduceExpectedKernelType - prod_grad : GetReduceGradExpectedKernelType - manual_signature : [prod] - -- op : psroi_pool - backward : psroi_pool_grad - inputs : - {x : X, boxes : ROIs, boxes_num : RoisNum} - outputs : - out : Out - -- op : push_sparse_v2 - inputs : - { x : Ids, W : w} - outputs : - out : Out - extra : - attrs : [int embeddingdim = 11, int tableid = 0, str accessorclass = "", str ctrlabelname = "", int paddingid = 0, bool scalesparsegrad = true, 'str[] inputnames = {}', bool is_distributed = true] - -- op : put_along_axis - backward : put_along_axis_grad - inputs : - {arr : Input, indices : Index, values : Value} - outputs : - out : Result - attrs : - {axis : Axis, reduce : Reduce, include_self: Include_self} - -- op : pylayer - backward : pylayer_grad - extra : - attrs : ['str[] skip_eager_deletion_vars = {}'] - -- op : qr - backward : qr_grad - inputs : - x : X - outputs : - {q : Q, r : R} - -- op : quantize - inputs : - input : Input - outputs : - output : Output - attrs : - {scale : Scale, shift : Shift, include_self: Include_self} - -- op : quantize_linear - extra : - attrs : [float moving_rate = 0.9] - -- op : randint - outputs : - out : Out - int_array: - shape : - data_type : int64_t - tensor_name : ShapeTensor - tensors_name : ShapeTensorList - manual_signature : [randint] - -- op : randperm - outputs : - out : Out - extra : - attrs : [int seed = 0] - -- op : real - backward : real_grad - inputs : - x : X - outputs : - out : Out - -- op : reciprocal - backward : reciprocal_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = 
false] - -- op : relu - backward : relu_grad, relu_double_grad (relu_grad_grad) - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : relu6 - backward : relu6_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, float threshold = 6.0] - -- op : remainder (elementwise_mod) - inputs : - {x : X, y : Y} - outputs : - {out : Out} - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - manual_signature : [remainder] - -- op : renorm - backward : renorm_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : repeat_interleave - inputs : - x : X - outputs : - out : Out - attrs : - repeats : Repeats - -- op : repeat_interleave - backward : repeat_interleave_grad - inputs : - x : X - outputs : - out : Out - attrs : - {repeats : Repeats, axis : dim} - -- op : repeat_interleave_with_tensor_index - backward : repeat_interleave_with_tensor_index_grad - inputs : - {x : X, repeats: RepeatTensor} - outputs: - out : Out - attrs: - axis : dim - -- op : requantize - inputs : - input : Input - outputs : - output : Output - attrs : - {scale_in : Scale_in, scale_out : Scale_out, shift_in : Shift_in, shift_out : Shift_out} - -- op : reshape (reshape2) - backward : reshape_grad (reshape2_grad) - inputs: - x : X - outputs: - out : Out - xshape: XShape - int_array: - shape : - data_type : int - tensor_name : Shape - tensors_name : ShapeTensor - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] - -- op : reverse - inputs: - x : X - outputs: - out : Out - int_array: - axis : - data_type : int - support_tensor : true - manual_signature : [reverse] - -- op : rmsprop_ (rmsprop) - inputs : - {param: Param, mean_square: MeanSquare, mean_grad: MeanGrad, learning_rate: LearningRate, grad: Grad, moment: Moment, master_param: MasterParam} - outputs : - {param_out: ParamOut, moment_out: MomentOut, mean_square_out: MeanSquareOut, mean_grad_out: MeanGradOut, master_param_outs: MasterParamOut} - -- op : rnn - backward : rnn_grad - inputs: - { x : Input, pre_state : PreState, weight_list : WeightList, sequence_length : SequenceLength} - outputs: - { out : Out, dropout_state_out : DropoutState, state : State, reserve : Reserve} - drop_empty_grad : [pre_state_grad, weight_list_grad] - -- op : roi_align - backward : roi_align_grad - inputs : - {x : X, boxes : ROIs, boxes_num : RoisNum} - outputs : - out : Out - -- op : roi_pool - backward : roi_pool_grad - inputs : - {x : X, boxes : ROIs, boxes_num : RoisNum} - outputs : - {out : Out, arg_max : Argmax} - -- op : roll - backward : roll_grad - inputs : - x : X - outputs : - out : Out - int_array : - shifts : - data_type : int64_t - tensor_name : ShiftsTensor - -- op : round - backward : round_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : row_conv - backward : row_conv_grad - inputs : - {x : X, filter : Filter} - outputs : - {out : Out} - -- op : rsqrt - backward : rsqrt_grad, rsqrt_double_grad (rsqrt_grad_grad) - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : save_combine - inputs : - {x : X} - -- op : scale - backward : scale_grad - inputs : - x : X - 
outputs : - out : Out - scalar : - scale : - data_type : float - tensor_name : ScaleTensor - extra : - attrs : [bool use_mkldnn = false] - -- op : scatter - backward : scatter_grad - inputs : - {x : X, index : Ids, updates : Updates} - outputs : - out : Out - -- op : scatter_nd_add - backward : scatter_nd_add_grad - inputs : - {x : X, index : Index, updates : Updates} - outputs : - out : Out - -- op : searchsorted - inputs : - {sorted_sequence : SortedSequence, values : Values} - outputs : - out : Out - -- op : seed - outputs : - out : Out - extra : - attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] - -- op : segment_pool - backward : segment_pool_grad - inputs : - {x : X, segment_ids : SegmentIds} - outputs : - {out : Out, summed_ids : SummedIds} - -- op : self_dp_attention - inputs : - x : X - outputs : - out : Out - -- op : selu - backward : selu_grad - inputs : - x : X - outputs : - out : Out - -- op : send_u_recv(graph_send_recv) - backward : send_u_recv_grad(graph_send_recv_grad) - inputs : - {x : X, src_index : Src_index, dst_index : Dst_index} - outputs : - {out : Out, dst_count : Dst_count} - int_array : - out_size: - data_type : int64_t - tensor_name : Out_size - -- op : send_ue_recv(graph_send_ue_recv) - backward : send_ue_recv_grad(graph_send_ue_recv_grad) - inputs : - {x : X, y : Y, src_index : Src_index, dst_index : Dst_index} - outputs : - {out : Out, dst_count : Dst_count} - int_array : - out_size: - data_type : int64_t - tensor_name : Out_size - -- op : send_uv (graph_send_uv) - backward : send_uv_grad (graph_send_uv_grad) - -- op : sequence_mask - inputs: - x : X - attrs: - max_len: maxlen - outputs: - y : Y - scalar : - max_len : - data_type : int - tensor_name : MaxLenTensor - -- op : sequence_softmax - backward : sequence_softmax_grad - extra : - attrs : [str data_format = "AnyLayout"] - -- op : sgd_ (sgd) - inputs : - {param : Param, learning_rate : LearningRate, grad : Grad, master_param : MasterParam} - outputs : - {param_out : ParamOut, master_param_out : MasterParamOut} - get_expected_kernel_type : - sgd_ : GetSgdExpectedKernelType - extra : - attrs : [bool use_mkldnn=false] - -- op : shape - inputs : - input : Input - outputs : - out : Out - -- op : shape - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - -- op : shard_index - inputs : - input : X - outputs : - out : Out - -- op : share_buffer - inputs : - x : X - outputs : - out : Out - xout : XOut - -- op : shuffle_batch - backward: shuffle_batch_grad - inputs: - {x : X, seed : Seed} - outputs: - {out : Out, shuffle_idx : ShuffleIdx, seed_out : SeedOut} - -- op : shuffle_channel - backward : shuffle_channel_grad - extra : - attrs : [bool use_mkldnn = false] - -- op : sigmoid - backward : sigmoid_grad, sigmoid_double_grad (sigmoid_grad_grad), sigmoid_triple_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : sign - backward : sign_grad - inputs : - x : X - outputs : - out : Out - -- op : silu - backward : silu_grad, silu_double_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : sin - backward : sin_grad, sin_double_grad, sin_triple_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : sinh - backward : sinh_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- 
op : slice - backward : slice_grad - inputs : - input : Input - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - int_array : - starts : - data_type : int - tensor_name : StartsTensor - tensors_name : StartsTensorList - ends : - data_type : int - tensor_name : EndsTensor - tensors_name : EndsTensorList - -- op : slogdet(slogdeterminant) - backward : slogdet_grad(slogdeterminant_grad) - inputs : - x : Input - outputs : - out : Out - -- op : soft_relu - backward : soft_relu_grad - inputs : - x : X - outputs : - out : Out - -- op : softmax - backward : softmax_grad - inputs : - x : X - outputs : - out : Out - get_expected_kernel_type : - softmax : GetSoftmaxExpectedKernelType - softmax_grad : GetSoftmaxGradExpectedKernelType - extra : - attrs : [str data_format = "AnyLayout", bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] - -- op : softplus - backward : softplus_grad, softplus_double_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : softshrink - backward : softshrink_grad - inputs : - x : X - outputs : - out : Out - attrs : - threshold : lambda - -- op : softsign - backward : softsign_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : solve - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : spectral_norm - backward : spectral_norm_grad - inputs : - {weight : Weight, u : U, v : V} - outputs : - out : Out - -- op : split - backward : split_grad - inputs: - x : X - outputs: - out : Out - int_array: - sections : - data_type : int - support_tensor : true - scalar : - axis : - data_type : int - support_tensor : true - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - -- op : split_with_num - scalar : - axis : - data_type : int - support_tensor : true - tensor_name : AxisTensor - -- op : sqrt - backward : sqrt_grad, sqrt_double_grad (sqrt_grad_grad) - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : square - backward : square_grad, square_double_grad (square_grad_grad) - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : squeeze (squeeze2) - backward : squeeze_grad (squeeze2_grad), squeeze_double_grad(squeeze2_double_grad) - inputs : - x : X - attrs : - axis : axes - outputs : - {out : Out, xshape : XShape} - int_array: - axis : - data_type : int - support_tensor : true - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - outputs : [xshape] - -- op : stack - backward : stack_grad - inputs : - x : X - outputs : - out : Y - extra : - attrs : [bool use_mkldnn = false] - drop_empty_grad : [x_grad] - -- op : stanh - backward : stanh_grad - inputs : - x : X - outputs : - out : Out - -- op : strided_slice - backward : strided_slice_grad - inputs : - x : Input - outputs : - out : Out - int_array : - starts : - data_type : int - tensor_name : StartsTensor - tensors_name : StartsTensorList - ends : - data_type : int - tensor_name : EndsTensor - tensors_name : EndsTensorList - strides : - data_type : int - tensor_name : StridesTensor - tensors_name : StridesTensorList - manual_signature : [strided_slice, strided_slice_grad] - get_expected_kernel_type : - strided_slice : GetStridedSliceExpectedKernelType - strided_slice_grad : 
GetStridedSliceGradExpectedKernelType - -- op : subtract (elementwise_sub) - backward : subtract_grad (elementwise_sub_grad) - inputs : - {x : X, y: Y} - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - complex_promote : [X, Y] - -- op : sum (reduce_sum) - backward : sum_grad (reduce_sum_grad), sum_double_grad - inputs: - {x : X} - outputs: - out : Out - attrs: - { axis : dim, keepdim : keep_dim, dtype : out_dtype} - extra : - attrs : [bool use_mkldnn = false] - int_array: - axis : - data_type : int - support_tensor : true - get_expected_kernel_type : - sum : GetReduceExpectedKernelType - sum_grad : GetReduceGradExpectedKernelType - manual_signature : [sum] - -- op : svd - backward : svd_grad - inputs : - x : X - outputs : - {u : U, s : S, vh : VH} - -- op : swish - backward : swish_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, float beta = 1.0] - -- op : sync_batch_norm - inputs : - {x : X, scale : Scale, bias : Bias, mean : Mean, variance : Variance} - outputs : - {out : Y, mean_out : MeanOut, variance_out : VarianceOut, saved_mean : SavedMean, saved_variance : SavedVariance, reserve_space : ReserveSpace} - backward : sync_batch_norm_grad - attrs: - data_format: data_layout - extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] - -- op : take_along_axis - backward : take_along_axis_grad - inputs : - {arr : Input, indices : Index} - outputs : - out : Result - attrs : - axis : Axis - -- op : tan - backward : tan_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : tanh - backward : tanh_grad, tanh_double_grad (tanh_grad_grad), tanh_triple_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : tanh_shrink - backward : tanh_shrink_grad - inputs : - x : X - outputs : - out : Out - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- op : tdm_sampler - inputs: - {x : X, travel : Travel, layer : Layer} - outputs: - {out : Out, labels : Labels, mask : Mask} - -- op : thresholded_relu - inputs : - x : X - outputs : - out : Out - -- op : tile - backward : tile_grad, tile_double_grad - inputs : - x : X - outputs : - out : Out - int_array: - repeat_times : - data_type : int - tensor_name : RepeatTimes - tensors_name : repeat_times_tensor - -- op : topk (top_k_v2) - backward : topk_grad (top_k_v2_grad) - inputs : - x : X - outputs : - {out : Out, indices : Indices} - scalar : - k : - data_type : int - tensor_name : K - -- op : trace - inputs : - x : Input - outputs : - out : Out - -- op : transpose (transpose2) - backward : transpose_grad (transpose2_grad) - inputs : - x : X - outputs : - out : Out - attrs: - perm : axis - extra : - outputs : [XShape] - attrs : [bool use_mkldnn = false, str data_format = "AnyLayout", str mkldnn_data_type = "float32"] - -- op : triangular_solve - backward : triangular_solve_grad - inputs : - {x : X, y : Y} - outputs : - out : Out - -- op : tril_triu - backward : tril_triu_grad - inputs : - {x: X} - outputs : - {out : Out} - -- op : trilinear_interp (trilinear_interp_v2) - backward : trilinear_interp_grad (trilinear_interp_v2_grad) - inputs : - {x : X, out_size : OutSize, size_tensor : SizeTensor, scale_tensor : Scale} - outputs : - output : Out - attrs: - data_format: data_layout - 
extra : - attrs : [bool use_mkldnn = false] - -- op : trunc - inputs : - input : X - outputs : - out : Out - -- op : truncated_gaussian_random - outputs : - out : Out - -- op : unbind - inputs : - input : X - outputs : - out : Out - -- op : unfold - inputs : - x : X - outputs : - out : Y - -- op : uniform (uniform_random) - outputs : - out : Out - int_array : - shape : - data_type : int64_t - tensor_name : ShapeTensor - tensors_name : ShapeTensorList - scalar : - min : - data_type : float - support_tensor : true - max : - data_type : float - support_tensor : true - manual_signature : [uniform] - -- op : uniform_inplace (uniform_random_inplace) - backward : uniform_inplace_grad(uniform_random_inplace_grad) - inputs : - x : X - outputs : - out : Out - -- op : unique - inputs : - {x : X} - outputs : - {out : Out, indices : Indices, inverse : Index, counts : Counts} - get_expected_kernel_type : - unique : GetUniqueExpectedKernelType - manual_signature : [unique] - -- op : unique_consecutive - inputs : - x : X - outputs : - {out : Out, index : Index, counts : Counts} - -- op : unpool - inputs : - {x : X, indices: Indices} - outputs : - out : Out - attrs : - padding : paddings - int_array : - output_size: - data_type : int - support_tensor : true - -- op : unpool3d - inputs : - {x : X, indices: Indices} - outputs : - out : Out - -- op : unsqueeze (unsqueeze2) - backward : unsqueeze_grad (unsqueeze2_grad), unsqueeze_double_grad(unsqueeze2_double_grad) - inputs : - x : X - attrs : - axis : axes - outputs : - {out : Out, xshape : XShape} - int_array: - axis : - data_type : int - tensor_name : AxesTensor - tensors_name : AxesTensorList - extra : - outputs : [xshape] - -- op : unstack - backward : unstack_grad - inputs : - x : X - outputs : - out : Y - -- op : update_loss_scaling_(update_loss_scaling) - inputs : - {x : X, found_infinite : FoundInfinite, prev_loss_scaling : PrevLossScaling, in_good_steps : InGoodSteps, in_bad_steps : InBadSteps} - outputs : - {out : Out, loss_scaling : LossScaling, out_good_steps : OutGoodSteps, out_bad_steps : OutBadSteps} - scalar : - stop_update : - data_type : bool - tensor_name : StopUpdate - get_expected_kernel_type : - update_loss_scaling_ : GetUpdateLossScalingExpectedKernelType - -- op : viterbi_decode - inputs : - {potentials : Input, transition_params : Transition, lengths : Length} - outputs : - {scores : Scores, path : Path} - -- op : warpctc - backward : warpctc_grad - inputs : - {logits : Logits, label : Label, logits_length : LogitsLength, labels_length : LabelLength} - outputs : - {warpctcgrad : WarpCTCGrad, loss : Loss} - -- op : where - backward : where_grad - inputs : - {condition : Condition, x : X, y : Y} - outputs : - out : Out - -- op : while - backward : while_grad - extra : - attrs : ['str[] skip_eager_deletion_vars = {}'] - -- op : yolo_box - inputs : - {x : X, img_size : ImgSize} - outputs : - {boxes : Boxes, scores : Scores} - -- op : yolo_loss (yolov3_loss) - backward: yolo_loss_grad (yolov3_loss_grad) - inputs : - {x : X, gt_box : GTBox, gt_label : GTLabel ,gt_score : GTScore} - outputs : - {loss : Loss , objectness_mask : ObjectnessMask, gt_match_mask : GTMatchMask} - get_expected_kernel_type : - yolo_loss : GetYoloLossExpectedKernelType - yolo_loss_grad : GetYoloLossExpectedKernelType - -- op: c_allgather - inputs : - x : X - outputs : - out: Out - -- op: c_allreduce_max - inputs : - x : X - outputs : - out: Out - -- op: c_allreduce_min - inputs : - x : X - outputs : - out: Out - -- op: c_allreduce_prod - inputs : - x : X - outputs : - 
out: Out - -- op: c_allreduce_sum - inputs : - x : X - outputs : - out: Out - -- op: c_broadcast - inputs : - x : X - outputs : - out : Out - -- op: c_identity - inputs : - x : X - outputs : - out: Out - -- op: c_reduce_min - inputs : - x : X - outputs : - out: Out - -- op: c_reduce_sum - inputs : - x : X - outputs : - out: Out - -- op: c_reducescatter - inputs : - x : X - outputs : - out: Out - -- op: c_sync_calc_stream - inputs : - x : X - outputs : - out : Out - -- op: c_sync_comm_stream - inputs : - x : X - outputs : - out : Out - -- op: channel_shuffle - inputs: - {x: X} - outputs: - {out: Out} - -- op: decayed_adagrad - inputs: - {param : Param, grad : Grad, moment : Moment, learning_rate : LearningRate} - outputs: - {param_out : ParamOut, moment_out : MomentOut} - -- op: distribute_fpn_proposals - inputs : - {fpn_rois: FpnRois, rois_num: RoisNum} - outputs : - multi_fpn_rois : MultiFpnRois - multi_level_rois_num: MultiLevelRoIsNum - restore_index: RestoreIndex - -- op: distributed_lookup_table - inputs: - {ids: Ids, w: W} - outputs: - outputs: Outputs - -- op: dpsgd - inputs: - {param: Param,grad: Grad,learning_rate: LearningRate} - outputs: - param_out : ParamOut - -- op: fetch (fetch_v2) - inputs: {x: X} - outputs: {out: Out} - -- op: ftrl - inputs: - {param: Param, squared_accumulator: SquaredAccumulator, linear_accumulator: LinearAccumulator, grad: Grad, learning_rate: LearningRate} - outputs: - {param_out: ParamOut, squared_accum_out: SquaredAccumOut, linear_accum_out: LinearAccumOut} - -- op: full_batch_size_like (fill_constant_batch_size_like) - inputs: - {input: Input} - outputs: - {out: Out} - -- op: fused_elemwise_add_activation - backward: fused_elemwise_add_activation_grad - inputs : - {x: X, y: Y} - outputs : - {out : Out, intermediate_out : IntermediateOut} - -- op: fusion_squared_mat_sub - inputs : - x : X - y : Y - outputs : - squared_x : SquaredX - squared_y : SquaredY - squared_xy : SquaredXY - out : Out - -- op: get_tensor_from_selected_rows - inputs : - x : X - outputs : - out : Out - -- op: identity_loss - inputs : - x: X - outputs : - out : Out - -- op: lars_momentum - inputs: - {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} - outputs : - {param_out: ParamOut, velocity_out: VelocityOut, master_param_out: MasterParamOut} - -- op: lod_array_length - inputs : - {x: X} - outputs : - out : Out - -- op: logspace - inputs: - {start: Start, stop: Stop, num: Num, base: Base} - outputs: - {out: Out} - -- op: lu - backward: lu_grad - inputs: - x: X - outputs: - {out: Out, pivots : Pivots, infos : Infos} - attrs: - pivot : pivots - -- op: match_matrix_tensor - backward: match_matrix_tensor_grad - inputs: - {x : X, y : Y, w : W} - outputs: - {out : Out, tmp : Tmp} - -- op: memcpy - inputs: - x: X - outputs: - out: Out - -- op: memcpy_d2h - inputs : - x : X - outputs : - out : Out - -- op: nce - backward: nce_grad - inputs: - {input : Input, label : Label, weight : Weight, bias : Bias, sample_weight : SampleWeight, custom_dist_probs : CustomDistProbs, custom_dist_alias : CustomDistAlias, custom_dist_alias_probs : CustomDistAliasProbs} - outputs: - {cost : Cost, sample_logits : SampleLogits, sample_labels : SampleLabels} - -- op: number_count - inputs : - {numbers: numbers} - outputs : - out : Out - -- op: read_from_array - inputs: - array : X - i : I - outputs : - out : Out - -- op: recv_v2 - outputs : - out : Out - -- op: reindex_graph (graph_reindex) - inputs : - {x : X, neighbors : Neighbors, count : Count, 
hashtable_value : HashTable_Value, hashtable_index : HashTable_Index} - outputs : - {reindex_src : Reindex_Src, reindex_dst : Reindex_Dst, out_nodes : Out_Nodes} - -- op: rrelu - inputs: - {x: X} - outputs: - {out: Out, noise: Noise} - -- op: send_v2 - inputs : - x : X - -- op: set_value - backward: set_value_grad - inputs: - x : Input - outputs: - out: Out - int_array: - starts: - data_type : int64_t - tensors_name : StartsTensorList - ends: - data_type : int64_t - tensors_name : EndsTensorList - steps: - data_type : int64_t - tensors_name : StepsTensorList - -- op: set_value_with_tensor - backward: set_value_grad - inputs: - x : Input - outputs: - out: Out - int_array: - starts: - data_type : int64_t - tensors_name : StartsTensorList - ends: - data_type : int64_t - tensors_name : EndsTensorList - steps: - data_type : int64_t - tensors_name : StepsTensorList - -- op: share_data - inputs : - x : X - outputs : - out : Out - -- op: sigmoid_cross_entropy_with_logits - backward: sigmoid_cross_entropy_with_logits_grad - inputs : - {x: X, label: Label} - outputs : - out : Out - -- op: skip_layernorm - inputs : - {x: X, y: Y, scale: Scale, bias : Bias} - outputs : - out : Out - -- op: sparse_momentum - inputs : - {param: Param, grad: Grad, velocity: Velocity, index: Index, axis: Axis, learning_rate: LearningRate,master_param: MasterParam} - outputs : - {param_out: ParamOut, velocity_out: VelocityOut, master_param_out: MasterParamOut} - scalar: - axis: - datatype : int - tensor_name : Axis - -- op: squared_l2_norm - backward: squared_l2_norm_grad - inputs : - x : X - outputs : - out : Out - -- op: temporal_shift - backward: temporal_shift_grad - inputs : - x : X - outputs : - out : Out - -- op: uniform_random_batch_size_like - inputs: - input : Input - outputs: - out: Out - -- op: write_to_array - inputs : - {x: X, i: I} - outputs : - out : Out From 18afa444bac99cf42d3757de3a75c1f1a89d2c35 Mon Sep 17 00:00:00 2001 From: Wang Huan Date: Wed, 7 Feb 2024 07:37:33 +0000 Subject: [PATCH 4/9] refine --- paddle/fluid/framework/new_executor/feed_fetch_utils.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc index 99829de387c321..f82350ec6d103f 100644 --- a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc +++ b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc @@ -115,6 +115,7 @@ void FetchTensors(const std::vector& job_fetch_names, &(PADDLE_GET(phi::DenseTensor, fetch_list->at(micro_batch_id)[col])); if (src.IsInitialized()) { TensorCopy(src, platform::CPUPlace(), dst); + dst->set_lod(src.lod()); } else { VLOG(6) << "Found " << var_name << " is not initialized and skip TensorCopy."; From 687744caac602b536a80a77f6bd22fc0e831cecb Mon Sep 17 00:00:00 2001 From: Wang Huan Date: Thu, 8 Feb 2024 02:44:39 +0000 Subject: [PATCH 5/9] refine --- test/mkldnn/test_layer_norm_bf16_mkldnn_op.py | 10 +++---- test/mkldnn/test_layer_norm_mkldnn_op.py | 26 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/test/mkldnn/test_layer_norm_bf16_mkldnn_op.py b/test/mkldnn/test_layer_norm_bf16_mkldnn_op.py index 5711ad4436acea..96dd1f818b1239 100644 --- a/test/mkldnn/test_layer_norm_bf16_mkldnn_op.py +++ b/test/mkldnn/test_layer_norm_bf16_mkldnn_op.py @@ -23,7 +23,7 @@ TestLayerNormMKLDNNOp, _reference_layer_norm_naive, ) -from utils import compare_legacy_with_pt +from utils import pir_executor_guard from paddle import base, enable_static from paddle.base import core @@ 
-133,11 +133,11 @@ def check_forward(
         self.__assert_close(mean, out[1], "mean")
         self.__assert_close(variance, out[2], "variance", 1e-3)
 
-    @compare_legacy_with_pt
     def test_check_forward_with_is_test(self):
-        self.check_forward(
-            shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True
-        )
+        with pir_executor_guard():
+            self.check_forward(
+                shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True
+            )
 
     # TODO (jczaja): Enable those to test when enabling training using bf16
     def test_check_forward_with_scale_and_bias(self):
diff --git a/test/mkldnn/test_layer_norm_mkldnn_op.py b/test/mkldnn/test_layer_norm_mkldnn_op.py
index c53687872307c6..d2ba6062ffe6bb 100644
--- a/test/mkldnn/test_layer_norm_mkldnn_op.py
+++ b/test/mkldnn/test_layer_norm_mkldnn_op.py
@@ -19,7 +19,7 @@
 
 import numpy as np
 from op_test import OpTestTool, _set_use_system_allocator
-from utils import compare_legacy_with_pt
+from utils import pir_executor_guard
 
 from paddle import base, enable_static
 from paddle.base import core
@@ -144,25 +144,25 @@ def check_forward(
         self.__assert_close(variance, out[2], "variance", 1e-3)
 
     @OpTestTool.skip_if_not_cpu_bf16()
-    @compare_legacy_with_pt
     def test_check_forward_non_last_begin_norm_axis(self):
-        self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=2)
+        with pir_executor_guard():
+            self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=2)
 
-    @compare_legacy_with_pt
     def test_check_forward_with_scale_and_bias(self):
-        self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=3)
+        with pir_executor_guard():
+            self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
-    @compare_legacy_with_pt
     def test_check_forward_without_scale_and_bias(self):
-        self.check_forward(
-            shape=[2, 3, 4, 5], begin_norm_axis=3, with_scale_bias=False
-        )
+        with pir_executor_guard():
+            self.check_forward(
+                shape=[2, 3, 4, 5], begin_norm_axis=3, with_scale_bias=False
+            )
 
-    @compare_legacy_with_pt
     def test_check_forward_with_is_test(self):
-        self.check_forward(
-            shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True
-        )
+        with pir_executor_guard():
+            self.check_forward(
+                shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True
+            )
 
 
 if __name__ == "__main__":

From 96654f1bcbb6c4ed792a9477fa4498eb8ad3cd55 Mon Sep 17 00:00:00 2001
From: Wang Huan
Date: Sun, 18 Feb 2024 09:01:13 +0000
Subject: [PATCH 6/9] refine

---
 paddle/fluid/operators/CMakeLists.txt         |   4 -
 .../operators/mkldnn/layer_norm_mkldnn_op.cc  | 152 ------------------
 paddle/fluid/operators/unity_build_rule.cmake |   2 -
 .../fluid/pir/dialect/operator/utils/utils.cc |   1 -
 .../cpu/onednn_to_paddle_layout_kernel.cc     |   4 +
 .../phi/kernels/onednn/layer_norm_kernel.cc   | 147 +++++++++++++++++
 6 files changed, 151 insertions(+), 159 deletions(-)
 delete mode 100644 paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc
 create mode 100644 paddle/phi/kernels/onednn/layer_norm_kernel.cc

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 423638426f7fd8..5d03c833a87c7e 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -106,10 +106,6 @@ if (WITH_GPU OR WITH_ROCM)
   register_cu_kernel(class_center_sample_op SRCS class_center_sample_op.cu DEPS ${OP_HEADER_DEPS})
 endif()
 
-if (WITH_MKLDNN)
-  register_mkldnn_kernel(layer_norm_op SRCS layer_norm_mkldnn_op.cc DEPS ${OP_HEADER_DEPS})
-endif()
-
 if (WITH_GPU OR WITH_ROCM)
   op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS})
 elseif (WITH_XPU_KP)
diff --git
a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc deleted file mode 100644 index 1f700c0630b1d6..00000000000000 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -#include "paddle/phi/backends/onednn/onednn_reuse.h" -#include "paddle/phi/common/data_type.h" - -namespace paddle { -namespace operators { - -template -class LayerNormOneDNNHandler - : public phi::funcs:: - OneDNNHandlerNoCachingT { - public: - LayerNormOneDNNHandler(const std::vector& dims, - const float& epsilon, - const dnnl::normalization_flags& flags, - const bool& is_test, - const phi::DenseTensor* x, - const dnnl::engine engine, - platform::Place cpu_place) - : phi::funcs::OneDNNHandlerNoCachingT( - engine, cpu_place) { - const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference - : dnnl::prop_kind::forward_training; - - this->AcquireForwardPrimitiveDescriptor( - fwd_prop_kind, x->mem_desc(), x->mem_desc(), epsilon, flags); - } - - std::tuple, std::shared_ptr> - AcquireScaleShiftMemory(const phi::DenseTensor* scale, - const phi::DenseTensor* shift) { - auto scale_memory = this->AcquireMemoryFromPrimitive( - this->fwd_pd_->weights_desc(), - phi::funcs::to_void_cast(scale->data())); - auto shift_memory = this->AcquireMemoryFromPrimitive( - this->fwd_pd_->weights_desc(), - phi::funcs::to_void_cast(shift->data())); - - return std::make_tuple(scale_memory, shift_memory); - } - - std::shared_ptr AcquireMeanMemory(phi::DenseTensor* mean) { - float* mean_data = mean->mutable_data( - this->place_, this->fwd_pd_->mean_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), - mean_data); - } - - std::shared_ptr AcquireVarianceMemory( - phi::DenseTensor* variance) { - float* variance_data = variance->mutable_data( - this->place_, this->fwd_pd_->variance_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), - variance_data); - } -}; - -template -class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Y"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - - const float epsilon = ctx.Attr("epsilon"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const bool is_test = ctx.Attr("is_test"); - - auto& dev_ctx = ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto src_tz = common::vectorize(x->dims()); - PADDLE_ENFORCE_EQ(begin_norm_axis, - (src_tz.size() - 1), - platform::errors::InvalidArgument( - "MKL-DNN Layer Norm supports only last logical " - "axis:%d as begin_norm_axis.", - (src_tz.size() - 1))); - - const bool with_scaleshift = (scale && bias); - dnnl::normalization_flags flags{}; 
- - if (with_scaleshift) { - flags |= dnnl::normalization_flags::use_scale | - dnnl::normalization_flags::use_shift; - } - - LayerNormOneDNNHandler handler( - src_tz, epsilon, flags, is_test, x, onednn_engine, ctx.GetPlace()); - - auto src_memory = handler.AcquireSrcMemory(x); - auto dst_memory = handler.AcquireDstMemory(out); - - auto layer_norm_p = handler.AcquireForwardPrimitive(); - - auto& astream = phi::OneDNNContext::tls().get_stream(); - std::unordered_map args = {{DNNL_ARG_SRC, *src_memory}, - {DNNL_ARG_DST, *dst_memory}}; - - if (!is_test) { - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - - auto mean_memory = handler.AcquireMeanMemory(mean); - auto variance_memory = handler.AcquireVarianceMemory(var); - - args.insert({DNNL_ARG_MEAN, *mean_memory}); - args.insert({DNNL_ARG_VARIANCE, *variance_memory}); - } - - if (with_scaleshift) { - auto scaleshift_mems = handler.AcquireScaleShiftMemory(scale, bias); - args.insert({DNNL_ARG_SCALE, *(std::get<0>(scaleshift_mems))}); - args.insert({DNNL_ARG_SHIFT, *(std::get<1>(scaleshift_mems))}); - } - - layer_norm_p->execute(astream, args); - astream.wait(); - - out->set_mem_desc(dst_memory->get_desc()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(layer_norm, - OneDNN, - ONEDNN, - ops::LayerNormMKLDNNOpKernel, - float, - paddle::platform::bfloat16) {} diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 2e1b6f86d6370c..07136f7bd4f310 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -131,8 +131,6 @@ register_unity_group( l1_norm_op.cc label_smooth_op.cc generated_op - mkldnn/layer_norm_mkldnn_op.cc - mkldnn/layer_norm_mkldnn_op.cc linspace_op.cc load_combine_op.cc load_op.cc) diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc index 832c7856788d06..c0f88cc3dc4b9a 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -79,7 +79,6 @@ const std::unordered_set LegacyOpList = { paddle::onednn::dialect::LrnGradOp::name(), paddle::onednn::dialect::QuantizeOp::name(), paddle::onednn::dialect::RequantizeOp::name(), - paddle::onednn::dialect::LayerNormOp::name(), #endif CReduceMinOp::name(), PushSparseV2Op::name()}; diff --git a/paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc b/paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc index f9324ca8b3e5f6..78855ef37d9c4a 100644 --- a/paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc +++ b/paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc @@ -64,6 +64,10 @@ void OneDNN2PaddleLayout(const Context& dev_ctx, VLOG(4) << "src_layout: " << src_layout << ", tmp_layout: " << tmp_layout; if (src_layout != DataLayout::ONEDNN || !x.storage_properties_initialized()) { + if (!x.IsInitialized()) { + out->Resize(x.dims()); + out->set_layout(tmp_layout); + } out->ShareDataWith(x); out->ShareInplaceVersionCounterWith(x); out->set_layout(static_cast(tmp_layout)); diff --git a/paddle/phi/kernels/onednn/layer_norm_kernel.cc b/paddle/phi/kernels/onednn/layer_norm_kernel.cc new file mode 100644 index 00000000000000..ff700cd11530fc --- /dev/null +++ b/paddle/phi/kernels/onednn/layer_norm_kernel.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +class LayerNormOneDNNHandler + : public phi::funcs:: + OneDNNHandlerNoCachingT { + public: + LayerNormOneDNNHandler(const std::vector& dims, + const float& epsilon, + const dnnl::normalization_flags& flags, + const bool& is_test, + const phi::DenseTensor* x, + const dnnl::engine engine, + Place cpu_place) + : phi::funcs::OneDNNHandlerNoCachingT( + engine, cpu_place) { + const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference + : dnnl::prop_kind::forward_training; + + this->AcquireForwardPrimitiveDescriptor( + fwd_prop_kind, x->mem_desc(), x->mem_desc(), epsilon, flags); + } + + std::tuple, std::shared_ptr> + AcquireScaleShiftMemory(const phi::DenseTensor* scale, + const phi::DenseTensor* shift) { + auto scale_memory = this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), + phi::funcs::to_void_cast(scale->data())); + auto shift_memory = this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), + phi::funcs::to_void_cast(shift->data())); + + return std::make_tuple(scale_memory, shift_memory); + } + + std::shared_ptr AcquireMeanMemory(phi::DenseTensor* mean) { + float* mean_data = mean->mutable_data( + this->place_, this->fwd_pd_->mean_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), + mean_data); + } + + std::shared_ptr AcquireVarianceMemory( + phi::DenseTensor* variance) { + float* variance_data = variance->mutable_data( + this->place_, this->fwd_pd_->variance_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), + variance_data); + } +}; + +template +void LayerNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale_opt, + const paddle::optional& bias_opt, + float epsilon, + int begin_norm_axis, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + bool is_test = dev_ctx.HasDnnAttr("is_test") + ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) + : false; + + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto src_tz = common::vectorize(x.dims()); + PADDLE_ENFORCE_EQ(begin_norm_axis, + (src_tz.size() - 1), + phi::errors::InvalidArgument( + "MKL-DNN Layer Norm supports only last logical " + "axis:%d as begin_norm_axis.", + (src_tz.size() - 1))); + + const bool with_scaleshift = (scale_opt && bias_opt); + dnnl::normalization_flags flags{}; + + if (with_scaleshift) { + flags |= dnnl::normalization_flags::use_scale | + dnnl::normalization_flags::use_shift; + } + + LayerNormOneDNNHandler handler( + src_tz, epsilon, flags, is_test, &x, onednn_engine, dev_ctx.GetPlace()); + + auto src_memory = handler.AcquireSrcMemory(&x); + auto dst_memory = handler.AcquireDstMemory(y); + + auto layer_norm_p = handler.AcquireForwardPrimitive(); + + auto& astream = phi::OneDNNContext::tls().get_stream(); + std::unordered_map args = {{DNNL_ARG_SRC, *src_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + if (!is_test) { + auto mean_memory = handler.AcquireMeanMemory(mean); + auto variance_memory = handler.AcquireVarianceMemory(var); + + args.insert({DNNL_ARG_MEAN, *mean_memory}); + args.insert({DNNL_ARG_VARIANCE, *variance_memory}); + } + + if (with_scaleshift) { + auto scaleshift_mems = handler.AcquireScaleShiftMemory(scale_opt.get_ptr(), + bias_opt.get_ptr()); + args.insert({DNNL_ARG_SCALE, *(std::get<0>(scaleshift_mems))}); + args.insert({DNNL_ARG_SHIFT, *(std::get<1>(scaleshift_mems))}); + } + + layer_norm_p->execute(astream, args); + astream.wait(); + + y->set_mem_desc(dst_memory->get_desc()); +} +} // namespace phi + +PD_REGISTER_KERNEL(layer_norm, + OneDNN, + ONEDNN, + phi::LayerNormKernel, + float, + phi::dtype::bfloat16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} From 691f50722818ba7931bcf7bbba416f5707ed696b Mon Sep 17 00:00:00 2001 From: Wang Huan Date: Sun, 18 Feb 2024 09:08:49 +0000 Subject: [PATCH 7/9] refine --- paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc b/paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc index 78855ef37d9c4a..f9257ebcddc36f 100644 --- a/paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc +++ b/paddle/phi/kernels/cpu/onednn_to_paddle_layout_kernel.cc @@ -67,6 +67,7 @@ void OneDNN2PaddleLayout(const Context& dev_ctx, if (!x.IsInitialized()) { out->Resize(x.dims()); out->set_layout(tmp_layout); + return; } out->ShareDataWith(x); out->ShareInplaceVersionCounterWith(x); From 4ded2b0f9d523e0d487611fe1956652049cac6d1 Mon Sep 17 00:00:00 2001 From: Wang Huan Date: Mon, 19 Feb 2024 06:18:22 +0000 Subject: [PATCH 8/9] refine --- .../phi/kernels/onednn/layer_norm_kernel.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/phi/kernels/onednn/layer_norm_kernel.cc b/paddle/phi/kernels/onednn/layer_norm_kernel.cc index ff700cd11530fc..02aa5298b23261 100644 --- a/paddle/phi/kernels/onednn/layer_norm_kernel.cc +++ b/paddle/phi/kernels/onednn/layer_norm_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -54,17 +54,18 @@ class LayerNormOneDNNHandler return std::make_tuple(scale_memory, shift_memory); } - std::shared_ptr AcquireMeanMemory(phi::DenseTensor* mean) { - float* mean_data = mean->mutable_data( - this->place_, this->fwd_pd_->mean_desc().get_size()); + std::shared_ptr AcquireMeanMemory(const OneDNNContext& dev_ctx, + phi::DenseTensor* mean) { + float* mean_data = dev_ctx.template Alloc( + mean, this->fwd_pd_->mean_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), mean_data); } std::shared_ptr AcquireVarianceMemory( - phi::DenseTensor* variance) { - float* variance_data = variance->mutable_data( - this->place_, this->fwd_pd_->variance_desc().get_size()); + const OneDNNContext& dev_ctx, phi::DenseTensor* variance) { + float* variance_data = dev_ctx.template Alloc( + variance, this->fwd_pd_->variance_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), variance_data); } @@ -115,8 +116,8 @@ void LayerNormKernel(const Context& dev_ctx, {DNNL_ARG_DST, *dst_memory}}; if (!is_test) { - auto mean_memory = handler.AcquireMeanMemory(mean); - auto variance_memory = handler.AcquireVarianceMemory(var); + auto mean_memory = handler.AcquireMeanMemory(dev_ctx, mean); + auto variance_memory = handler.AcquireVarianceMemory(dev_ctx, var); args.insert({DNNL_ARG_MEAN, *mean_memory}); args.insert({DNNL_ARG_VARIANCE, *variance_memory}); From 360f9c7ae128ed0a75fb6885b143d8438074c655 Mon Sep 17 00:00:00 2001 From: Wang Huan Date: Mon, 19 Feb 2024 06:22:43 +0000 Subject: [PATCH 9/9] refine --- paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index ef1ec30fb5e222..828db3ee7e0b64 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -50,7 +50,7 @@ #include "paddle/pir/include/dialect/control_flow/ir/cf_op.h" #ifdef PADDLE_WITH_DNNL -#include "build/paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h" #include "paddle/fluid/pir/dialect/operator/trait/onednn.h"
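A minimal usage sketch (not part of the patch series above): once layer_norm is registered as a phi OneDNN kernel and dropped from LegacyOpList, the op can be exercised from Python on CPU with oneDNN enabled. FLAGS_use_mkldnn and paddle.nn.functional.layer_norm are standard Paddle APIs; that this routes to the new kernel on a branch with these patches applied is an assumption, not something verified here.

# Hypothetical end-to-end check; assumes this patch series is applied and
# Paddle was built with oneDNN support (e.g. -DWITH_MKLDNN=ON).
import paddle

paddle.device.set_device("cpu")
paddle.set_flags({"FLAGS_use_mkldnn": True})  # route eligible CPU ops to oneDNN

x = paddle.rand([2, 4, 8], dtype="float32")
weight = paddle.ones([8], dtype="float32")
bias = paddle.zeros([8], dtype="float32")

# normalized_shape=[8] normalizes over the last axis only, matching the
# kernel's check that begin_norm_axis equals the last logical axis.
out = paddle.nn.functional.layer_norm(
    x, normalized_shape=[8], weight=weight, bias=bias, epsilon=1e-5
)
print(out.shape)  # [2, 4, 8]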