diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc
index c01df4d6e236c7..6e1ec454b6bab3 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc
@@ -3113,6 +3113,25 @@ struct CEmbeddingOpTranscriber : public OpTranscriber {
   }
 };

+struct QuantizeLinearOpTranscriber : public OpTranscriber {
+  void HandleNonexistentAttribute(pir::IrContext* ctx,
+                                  pir::AttributeMap* attribute_map,
+                                  const OpAttributeInfo& info) override {
+    if (info.name == "round_type") {
+      (*attribute_map)[info.name] = pir::Int32Attribute::get(ctx, 0);
+    }
+    if (info.name == "is_test") {
+      (*attribute_map)[info.name] = pir::BoolAttribute::get(ctx, true);
+    }
+    if (info.name == "only_observer") {
+      (*attribute_map)[info.name] = pir::BoolAttribute::get(ctx, false);
+    }
+    if (info.name == "moving_rate") {
+      (*attribute_map)[info.name] = pir::FloatAttribute::get(ctx, 0.9);
+    }
+  }
+};
+
 OpTranslator::OpTranslator() {
   pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
@@ -3185,6 +3204,8 @@ OpTranslator::OpTranslator() {
   special_handlers["elementwise_mod_grad"] = ElementwiseGradTranscriber();
   special_handlers["elementwise_floordiv_grad"] = ElementwiseGradTranscriber();
   special_handlers["c_embedding"] = CEmbeddingOpTranscriber();
+  special_handlers["quantize_linear"] = QuantizeLinearOpTranscriber();
+  special_handlers["dequantize_linear"] = QuantizeLinearOpTranscriber();
 }

 }  // namespace translator
diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index b141f1ecfa8792..54b56a2e3c8875 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -108,6 +108,12 @@
     'lrn',
     'multi_gru',
     'matmul_with_flatten',
+    'moving_average_abs_max_scale',
+    'moving_average_abs_max_scale_',
+    'quantize_linear',
+    'quantize_linear_',
+    'dequantize_linear',
+    'dequantize_linear_',
 ]

 NO_NEED_GEN_STATIC_ONLY_APIS = [
diff --git a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.cc b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.cc
index 44183751b8ca1d..54692375248809 100644
--- a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.cc
@@ -28,6 +28,10 @@ KernelKeyTuple UniqueOpParseKernelKey(pir::Operation* op) {
   return {dtype, backend};
 }

+KernelKeyTuple SaveCombineOpParseKernelKey(pir::Operation* op) {
+  return {phi::DataType::FLOAT32, phi::Backend::UNDEFINED};
+}
+
 }  // namespace paddle::dialect

 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ParseKernelKeyInterface)
diff --git a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h
index 2d101dbd310d5e..7913893fdb7d7a 100644
--- a/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h
+++ b/paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h
@@ -57,6 +57,8 @@ class ParseKernelKeyInterface

 // Register the ParseKernelKeyInterface for unique op.
 KernelKeyTuple UniqueOpParseKernelKey(pir::Operation *op);

+KernelKeyTuple SaveCombineOpParseKernelKey(pir::Operation *op);
+
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
index f1e20326d59dee..594130926d5695 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
@@ -366,6 +366,19 @@
     data_type : x
   backward : depthwise_conv2d_transpose_grad

+- op : dequantize_linear
+  args : (Tensor x, Tensor scale, Tensor zero_point, Tensor in_accum, Tensor in_state, int quant_axis = 0, int bit_length = 8, int round_type = 0, bool is_test = true, bool only_observer = false, float moving_rate=0.9f)
+  output : Tensor(y), Tensor(out_scale), Tensor(out_accum), Tensor(out_state)
+  infer_meta :
+    func : QuantizeLinearInferMeta
+    param : [x, scale, in_accum, in_state, quant_axis]
+  kernel :
+    func : quantize_linear
+    param : [x, scale, zero_point, in_accum, in_state, quant_axis, bit_length, round_type, is_test, only_observer, moving_rate]
+    data_type : x
+  optional : in_accum, in_state, out_scale, out_accum, out_state
+  inplace : (scale -> out_scale, in_accum -> out_accum, in_state -> out_state)
+
 - op : disable_check_model_nan_inf
   args: (Tensor x, int flag = 0)
   output: Tensor(out)
@@ -1083,6 +1096,19 @@
     data_type : out_grad_in
   inplace: (out_grad_in -> out_grad_out)

+- op : quantize_linear
+  args : (Tensor x, Tensor scale, Tensor zero_point, Tensor in_accum, Tensor in_state, int quant_axis = 0, int bit_length = 8, int round_type = 0, bool is_test = true, bool only_observer = false, float moving_rate=0.9f)
+  output : Tensor(y), Tensor(out_scale), Tensor(out_accum), Tensor(out_state)
+  infer_meta :
+    func : QuantizeLinearInferMeta
+    param : [x, scale, in_accum, in_state, quant_axis]
+  kernel :
+    func : quantize_linear
+    param : [x, scale, zero_point, in_accum, in_state, quant_axis, bit_length, round_type, is_test, only_observer, moving_rate]
+    data_type : x
+  optional : in_accum, in_state, out_scale, out_accum, out_state
+  inplace : (scale -> out_scale, in_accum -> out_accum, in_state -> out_state)
+
 - op : randint
   args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={})
   output : Tensor(out)
@@ -1215,6 +1241,7 @@
     func: save_combine_tensor
     param: [x, file_path, overwrite, save_as_fp16, save_to_memory]
   optional : out
+  interfaces : paddle::dialect::ParseKernelKeyInterface

 - op : seed
   args : (int seed, bool deterministic, str rng_name, bool force_cpu)
@@ -1635,6 +1662,18 @@
     func: match_matrix_tensor
   backward: match_matrix_tensor_grad

+- op: moving_average_abs_max_scale
+  args: (Tensor x, Tensor in_accum, Tensor in_state, float moving_rate=0.9f, bool is_test=false)
+  output: Tensor(out), Tensor(out_scale), Tensor(out_state), Tensor(out_accum)
+  infer_meta:
+    func: MovingAverageAbsMaxScaleInferMeta
+    param: [x, in_accum, in_state]
+  kernel:
+    func: moving_average_abs_max_scale
+    param: [x, in_accum, in_state, moving_rate, is_test]
+  optional : in_accum, in_state, out, out_state, out_accum
+  inplace : (in_accum -> out_accum), (in_state -> out_state)
+
 - op: nce
   args: (Tensor input, Tensor label, Tensor weight, Tensor bias, Tensor sample_weight, Tensor custom_dist_probs, Tensor custom_dist_alias, Tensor custom_dist_alias_probs, int num_total_classes, int[] custom_neg_classes={}, int num_neg_samples=10, int sampler=0, int seed=0, bool is_sparse=false, bool remote_prefetch=false, bool is_test=false)
   output: Tensor(cost), Tensor(sample_logits), Tensor(sample_labels)
diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc
index b4bad427567b70..296dec7d51940e 100644
--- a/paddle/fluid/pir/dialect/operator/utils/utils.cc
+++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc
@@ -72,6 +72,12 @@ const std::unordered_set<std::string> LegacyOpList = {
     NceGradOp::name(),
     LrnOp::name(),
     LrnGradOp::name(),
+    MovingAverageAbsMaxScaleOp::name(),
+    MovingAverageAbsMaxScale_Op::name(),
+    QuantizeLinearOp::name(),
+    QuantizeLinear_Op::name(),
+    DequantizeLinearOp::name(),
+    DequantizeLinear_Op::name(),
 #ifdef PADDLE_WITH_DNNL
     paddle::onednn::dialect::LrnOp::name(),
     paddle::onednn::dialect::LrnGradOp::name(),
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 53e0cea953b879..1a3f86753fa7e9 100755
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -771,6 +771,10 @@
     {scale : Scale, shift : Shift}

 - op : dequantize_linear
+  inputs :
+    {x : X, scale : Scale, zero_point : ZeroPoint, in_accum : InAccum, in_state : InState}
+  outputs :
+    {y : Y, out_scale : OutScale, out_accum : OutAccum, out_state : OutState}
   extra :
     attrs : [float moving_rate = 0.9]

@@ -2197,6 +2201,12 @@
   outputs :
     {param_out : ParamOut, velocity_out : VelocityOut, master_param_out : MasterParamOut}

+- op : moving_average_abs_max_scale
+  inputs :
+    {x : X, in_accum : InAccum, in_state : InState}
+  outputs :
+    {out : Out, out_scale : OutScale, out_state : OutState, out_accum : OutAccum}
+
 - op : multi_dot
   backward : multi_dot_grad
   inputs :
@@ -2546,6 +2556,10 @@
     {scale : Scale, shift : Shift, include_self: Include_self}

 - op : quantize_linear
+  inputs :
+    {x : X, scale : Scale, zero_point : ZeroPoint, in_accum : InAccum, in_state : InState}
+  outputs :
+    {y : Y, out_scale : OutScale, out_accum : OutAccum, out_state : OutState}
   extra :
     attrs : [float moving_rate = 0.9]

diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc
index 07b05ccd2c760c..4a6888fad7d86a 100644
--- a/paddle/phi/core/kernel_context.cc
+++ b/paddle/phi/core/kernel_context.cc
@@ -119,7 +119,7 @@ const AttrType& KernelContext::AttrAt(size_t idx) const {
     return paddle::get<AttrType>(attrs_.at(idx));
   } catch (paddle::bad_variant_access const& ex) {
     PADDLE_THROW(phi::errors::InvalidArgument(
-        "Attribute cast error in Op Kernel Context."));
+        "Attribute %d cast error in Op Kernel Context.", idx));
   }
 }

diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index b7a5dd51de9015..69214508ef3f92 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -3497,6 +3497,32 @@ void PsroiPoolInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }

+void QuantizeLinearInferMeta(const MetaTensor& x,
+                             const MetaTensor& scale,
+                             const MetaTensor& in_accum,
+                             const MetaTensor& in_state,
+                             int quant_axis,
+                             MetaTensor* y,
+                             MetaTensor* out_scale,
+                             MetaTensor* out_accum,
+                             MetaTensor* out_state) {
+  y->set_dims(x.dims());
+  y->share_lod(x);
+  if (out_scale) {
+    if (quant_axis < 0) {
+      out_scale->set_dims(scale.dims());
+    } else {
+      out_scale->set_dims({x.dims()[quant_axis]});
+    }
+  }
+  if (out_accum) {
+    out_accum->set_dims(in_accum.dims());
+  }
+  if (out_state) {
+    out_state->set_dims(in_state.dims());
+  }
+}
+
 void RmsNormInferMeta(const MetaTensor& x,
                       const MetaTensor& bias,
                       const MetaTensor& residual,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 0774189dd8d4fc..3d9b2539267e7e 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -636,6 +636,16 @@ void PsroiPoolInferMeta(const MetaTensor& x,
                         float spatial_scale,
                         MetaTensor* out);

+void QuantizeLinearInferMeta(const MetaTensor& x,
+                             const MetaTensor& scale,
+                             const MetaTensor& in_accum,
+                             const MetaTensor& in_state,
+                             int quant_axis,
+                             MetaTensor* y,
+                             MetaTensor* out_scale,
+                             MetaTensor* out_accum,
+                             MetaTensor* out_state);
+
 void RmsNormInferMeta(const MetaTensor& x,
                       const MetaTensor& bias,
                       const MetaTensor& residual,
diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc
index b728c33abf2e2d..edd03e6b07513f 100644
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -896,6 +896,26 @@ void MultiClassNMSInferMeta(const MetaTensor& bboxes,
   nms_rois_num->set_dtype(DataType::INT32);
 }

+void MovingAverageAbsMaxScaleInferMeta(const MetaTensor& x,
+                                       const MetaTensor& in_accum,
+                                       const MetaTensor& in_state,
+                                       MetaTensor* out,
+                                       MetaTensor* out_scale,
+                                       MetaTensor* out_state,
+                                       MetaTensor* out_accum) {
+  if (out) {
+    out->set_dims(x.dims());
+    out->share_lod(x);
+    out_scale->set_dims({1});
+  }
+  if (out_state) {
+    out_state->set_dims(in_state.dims());
+  }
+  if (out_accum) {
+    out_accum->set_dims(in_accum.dims());
+  }
+}
+
 void NllLossRawInferMeta(const MetaTensor& input,
                          const MetaTensor& label,
                          const MetaTensor& weight,
diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h
index 7ffdc3d272069f..d12378fe3a92c1 100644
--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -140,6 +140,14 @@ void MatchMatrixTensorInferMeta(const MetaTensor& x,
                                 MetaTensor* tmp,
                                 MetaConfig config = MetaConfig());

+void MovingAverageAbsMaxScaleInferMeta(const MetaTensor& x,
+                                       const MetaTensor& in_accum,
+                                       const MetaTensor& in_state,
+                                       MetaTensor* out,
+                                       MetaTensor* out_scale,
+                                       MetaTensor* out_state,
+                                       MetaTensor* out_accum);
+
 void MultiClassNMSInferMeta(const MetaTensor& bboxes,
                             const MetaTensor& scores,
                             const MetaTensor& rois_num,
diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py
index 87497184392e69..401737bb13ac68 100644
--- a/python/paddle/distributed/auto_parallel/static/engine.py
+++ b/python/paddle/distributed/auto_parallel/static/engine.py
@@ -257,6 +257,11 @@ def __init__(
             paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1})
             paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1})

+        if auto_utils.use_new_executor():
+            is_pir_mode = os.environ.get("FLAGS_enable_pir_in_executor", None)
+            if is_pir_mode is None:
+                paddle.framework.set_flags({'FLAGS_enable_pir_in_executor': 1})
+
         self.enable_job_schedule_profiler = False

         # get dist input spec from shard dataloader
diff --git a/test/auto_parallel/engine_api.py b/test/auto_parallel/engine_api.py
index cc921d41a74a9d..7edcb9a9823cdf 100644
--- a/test/auto_parallel/engine_api.py
+++ b/test/auto_parallel/engine_api.py
@@ -234,155 +234,6 @@ def train_low_level():
     engine.load(model_filename)
     temp_dir.cleanup()

-    # Build dataloader from generator
-    # train
-    train_dataset = MyDataset(batch_num * batch_size)
-    train_dataloader = engine.dataloader_from_generator(
-        train_dataset, batch_size=batch_size, mode="train"
-    )
-    engine.prepare(mode="train")
-    for data in train_dataloader:
-        outs = engine.run(data, feed=feed_dict, mode="train")
-
-    # eval
-    engine.to_mode("eval")
-    eval_dataset2 = MyDataset(batch_size)
-    eval_dataloader = engine.dataloader_from_generator(
-        eval_dataset2, batch_size=batch_size
-    )
-    engine.prepare()
-    for data in eval_dataloader:
-        outs = engine.run(data, feed=feed_dict)
-
-    # predict
-    test_dataset = MyDataset(batch_size)
-    predict_dataloader = engine.dataloader_from_generator(
-        test_dataset, batch_size=batch_size, mode="predict"
-    )
-    engine.prepare(mode="predict")
-    for data in predict_dataloader:
-        outs = engine.run(data, feed=feed_dict, mode="predict")
-
-    # save
-    temp_dir = tempfile.TemporaryDirectory()
-    model_filename = os.path.join(temp_dir.name, 'mlp')
-    engine.save(model_filename, training=True)
-    engine.load(model_filename)
-    temp_dir.cleanup()
-
-
-def train_builtin_data_vars():
-    paddle.distributed.auto_parallel.static.dist_context.set_default_distributed_context(
-        None
-    )
-    mlp = MLPLayer(
-        hidden_size=hidden_size,
-        intermediate_size=4 * hidden_size,
-        dropout_ratio=0.1,
-        initializer_range=0.02,
-    )
-    loss = paddle.nn.CrossEntropyLoss()
-    optimizer = paddle.optimizer.Adam(
-        learning_rate=0.00001,
-        beta1=0.9,
-        beta2=0.999,
-        epsilon=1e-08,
-        grad_clip=None,
-    )
-    metric = paddle.metric.Accuracy()
-
-    strategy = auto.Strategy()
-    strategy.auto_mode = "semi"
-
-    engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy)
-
-    # train
-    engine.to_mode("train")
-
-    input_spec = static.InputSpec([batch_size, image_size], 'float32', 'input')
-    label_spec = static.InputSpec([batch_size, 1], 'int64', 'label')
-    engine.prepare(inputs_spec=[input_spec], labels_spec=[label_spec])
-
-    with static.program_guard(engine.main_program, engine.startup_program):
-        feed_list = engine.inputs + engine.labels
-        print(feed_list)
-        loader = paddle.base.io.DataLoader.from_generator(
-            feed_list=feed_list, capacity=4 * batch_size, iterable=False
-        )
-
-        places = static.cuda_places()
-        loader.set_batch_generator(batch_generator_creator(), places=places)
-
-    for _ in range(epoch_num):
-        loader.start()  # call DataLoader.start() before each epoch starts
-        try:
-            while True:
-                engine.run()
-        except paddle.base.core.EOFException:
-            loader.reset()  # call DataLoader.reset() after catching EOFException
-
-
-def train_non_builtin_data_vars():
-    paddle.distributed.auto_parallel.static.dist_context.set_default_distributed_context(
-        None
-    )
-    main_program = static.Program()
-    startup_program = static.Program()
-    with static.program_guard(
-        main_program, startup_program
-    ), utils.unique_name.guard():
-        input = static.data(
-            name="input", shape=[batch_size, image_size], dtype='float32'
-        )
-        label = static.data(name="label", shape=[batch_size, 1], dtype='int64')
-
-        loader = paddle.base.io.DataLoader.from_generator(
-            feed_list=[input, label], capacity=4 * batch_size, iterable=False
-        )
-        places = static.cuda_places()
-        loader.set_batch_generator(batch_generator_creator(), places=places)
-
-        mlp = MLPLayer(
-            hidden_size=hidden_size,
-            intermediate_size=4 * hidden_size,
-            dropout_ratio=0.1,
-            initializer_range=0.02,
-        )
-        loss = paddle.nn.CrossEntropyLoss()
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=0.00001,
-            beta1=0.9,
-            beta2=0.999,
-            epsilon=1e-08,
-            grad_clip=None,
-        )
-        metric = paddle.metric.Accuracy()
-        predict = mlp(input)
-        loss_var = loss(predict, label)
-
-    strategy = auto.Strategy()
-    strategy.auto_mode = "semi"
-
-    engine = auto.Engine(
-        loss=loss_var, optimizer=optimizer, metrics=metric, strategy=strategy
-    )
-
-    # train
-    engine.to_mode("train")
-    engine.prepare(
-        inputs=[input],
-        labels=[label],
-        main_program=main_program,
-        startup_program=startup_program,
-    )
-    for _ in range(epoch_num):
-        loader.start()  # call DataLoader.start() before each epoch starts
-        try:
-            while True:
-                engine.run()
-        except paddle.base.core.EOFException:
-            loader.reset()  # call DataLoader.reset() after catching EOFException
-

 def get_cost():
     paddle.distributed.auto_parallel.static.dist_context.set_default_distributed_context(
@@ -522,8 +373,6 @@ def get_cost_by_spec():
     train_high_level(fetch=True)
    train_high_level(fetch=False)
     train_low_level()
-    train_builtin_data_vars()
-    train_non_builtin_data_vars()
     get_cost()
     get_cost_by_default_program()
     get_cost_by_spec()
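Not part of the patch itself, but a minimal smoke-test sketch of the path this change enables: it builds a legacy (ProgramDesc) program containing a quantize_linear op and translates it to PIR, which is what the new QuantizeLinearOpTranscriber and the ops.yaml/op_compat.yaml entries above are meant to cover. The paddle.pir.translate_to_pir entry point, the attribute set accepted by append_op, and the shapes used here are assumptions, not something this diff verifies.

# Hypothetical smoke test (not part of this patch): build a legacy ProgramDesc
# program containing a quantize_linear op, then translate it to PIR so the new
# transcriber and op definitions are exercised.
import paddle

paddle.enable_static()

main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
    block = main_program.global_block()
    x = paddle.static.data(name='x', shape=[4, 16], dtype='float32')
    scale = paddle.static.data(name='scale', shape=[1], dtype='float32')
    zero_point = paddle.static.data(name='zero_point', shape=[1], dtype='float32')
    y = block.create_var(name='y', dtype='float32')
    # Legacy op with its ProgramDesc names (X/Scale/ZeroPoint/Y, see the
    # op_compat.yaml mapping above). round_type/is_test/only_observer/
    # moving_rate are omitted here; if the serialized op lacks them, the
    # transcriber's HandleNonexistentAttribute fallbacks fill them in.
    block.append_op(
        type='quantize_linear',
        inputs={'X': x, 'Scale': scale, 'ZeroPoint': zero_point},
        outputs={'Y': y},
        attrs={'quant_axis': -1, 'bit_length': 8},
    )

# Assumed entry point for ProgramDesc -> PIR translation; the name may differ
# across Paddle versions (e.g. translate_to_new_ir in older releases).
pir_program = paddle.pir.translate_to_pir(main_program.desc)
print(pir_program)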