From e9021b9747d4c8eef81c1060182cb57370c1d433 Mon Sep 17 00:00:00 2001 From: co63oc Date: Sun, 28 Jan 2024 08:04:07 +0800 Subject: [PATCH 1/2] Fix --- .../fluid/framework/downpour_lite_worker.cc | 2 +- paddle/fluid/framework/downpour_worker.cc | 2 +- paddle/fluid/framework/downpour_worker_opt.cc | 2 +- paddle/fluid/framework/fleet/heter_context.h | 4 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 4 +- paddle/fluid/framework/hetercpu_worker.cc | 2 +- paddle/fluid/framework/hogwild_worker.cc | 2 +- .../fused_multi_transformer_encoder_pass.cc | 14 +++--- .../ir/multihead_matmul_fuse_pass.cc | 2 +- .../ir/multihead_matmul_roformer_fuse_pass.h | 2 +- .../ir/xpu/fast_where_xpu_fuse_pass.cc | 2 +- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/naive_executor.h | 2 +- .../new_executor/feed_fetch_utils.cc | 4 +- .../control_flow/if_instruction.cc | 2 +- .../instruction/custom_kernel_instruction.cc | 4 +- .../interpreter/dependency_builder.cc | 34 ++++++------- .../interpreter/dependency_builder.h | 4 +- .../new_executor/new_executor_defs.cc | 2 +- .../new_executor/new_executor_defs.h | 2 +- .../framework/new_executor/pir_interpreter.cc | 46 +++++++++--------- .../framework/new_executor/pir_interpreter.h | 14 +++--- .../new_executor/program_interpreter.cc | 48 +++++++++---------- .../new_executor/program_interpreter.h | 14 +++--- paddle/fluid/framework/parallel_executor.cc | 4 +- paddle/fluid/framework/ps_gpu_worker.cc | 2 +- paddle/fluid/framework/trainer_desc.proto | 2 +- .../fluid/inference/api/analysis_predictor.cc | 4 +- paddle/fluid/jit/property.h | 2 +- python/paddle/base/trainer_desc.py | 2 +- 30 files changed, 116 insertions(+), 116 deletions(-) diff --git a/paddle/fluid/framework/downpour_lite_worker.cc b/paddle/fluid/framework/downpour_lite_worker.cc index c57ef71ae0342b..3d453c018c1d5f 100644 --- a/paddle/fluid/framework/downpour_lite_worker.cc +++ b/paddle/fluid/framework/downpour_lite_worker.cc @@ -117,7 +117,7 @@ void DownpourLiteWorker::Initialize(const TrainerDesc& desc) { << dest_table; copy_dense_tables_.push_back(std::make_pair(src_table, dest_table)); } - for (auto& m : copy_table_config_.table_denpendency_map()) { + for (auto& m : copy_table_config_.table_dependency_map()) { if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) { // currently only support one dependency for (auto& value : m.values()) { diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index c9bd59f912d7a3..6ce2967a08f1f5 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -116,7 +116,7 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) { << dest_table; copy_dense_tables_.emplace_back(src_table, dest_table); } - for (auto& m : copy_table_config_.table_denpendency_map()) { + for (auto& m : copy_table_config_.table_dependency_map()) { if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) { // currently only support one dependency for (auto& value : m.values()) { diff --git a/paddle/fluid/framework/downpour_worker_opt.cc b/paddle/fluid/framework/downpour_worker_opt.cc index d7d8a7ff883cdd..2e3a83251de157 100644 --- a/paddle/fluid/framework/downpour_worker_opt.cc +++ b/paddle/fluid/framework/downpour_worker_opt.cc @@ -177,7 +177,7 @@ void DownpourWorkerOpt::Initialize(const TrainerDesc& desc) { << dest_table; copy_dense_tables_.emplace_back(src_table, dest_table); } - for (auto& m : copy_table_config_.table_denpendency_map()) { + for (auto& m : 
copy_table_config_.table_dependency_map()) { if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) { // currently only support one dependency for (auto& value : m.values()) { diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 4c5f03d1bb780a..f7cce0ab44940a 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -172,7 +172,7 @@ class HeterContext { } } } else { - VLOG(3) << "Reset gpu task with dynamic mf dimention"; + VLOG(3) << "Reset gpu task with dynamic mf dimension"; for (size_t i = 0; i < feature_dim_keys_.size(); i++) { for (size_t j = 0; j < feature_dim_keys_[i].size(); j++) { feature_dim_keys_[i][j].clear(); @@ -262,7 +262,7 @@ class HeterContext { threads.push_back(std::thread(unique_dynamic_mf_func, i, j)); } } - VLOG(3) << "heter_context unique keys with dynamic mf dimention"; + VLOG(3) << "heter_context unique keys with dynamic mf dimension"; } for (std::thread& t : threads) { t.join(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index b3e48e0d5b63b9..0399c37d22b689 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -2752,7 +2752,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, VLOG(3) << "Begin GPUPS PushSparseGrad"; auto buf = memory::Alloc(place, total_length * grad_value_size); - VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_ + VLOG(3) << "Push Sparse Max mf dimension: " << max_mf_dim_ << "grad_value_size:" << grad_value_size; float* total_grad_values_gpu = reinterpret_cast(buf->ptr()); @@ -2790,7 +2790,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, VLOG(3) << "Begin GPUPS PushSparseGrad"; auto buf = memory::Alloc(place, total_length * grad_value_size); - VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_ + VLOG(3) << "Push Sparse Max mf dimension: " << max_mf_dim_ << "grad_value_size:" << grad_value_size; float* total_grad_values_gpu = reinterpret_cast(buf->ptr()); phi::DenseTensor& total_keys_tensor = keys_tensor[devid_2_index]; diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc index f741baa0f3d2d8..0959b0ae334424 100644 --- a/paddle/fluid/framework/hetercpu_worker.cc +++ b/paddle/fluid/framework/hetercpu_worker.cc @@ -180,7 +180,7 @@ void HeterCpuWorker::Initialize(const TrainerDesc& desc) { << dest_table; copy_dense_tables_.push_back(std::make_pair(src_table, dest_table)); } - for (auto& m : copy_table_config_.table_denpendency_map()) { + for (auto& m : copy_table_config_.table_dependency_map()) { if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) { // currently only support one dependency for (auto& value : m.values()) { diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 5d4b32918f05f8..d95007043dfb54 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -805,7 +805,7 @@ void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { // depend_builder.Build(ops_, start_index, sharding_mode_); hbm not safe // should run in debug model need to fix depend_builder.Build(ops_, start_index, false); - new_order = depend_builder.get_new_exexutor_order(); + new_order = depend_builder.get_new_executor_order(); std::vector> new_ops; std::vector final_order; std::vector 
new_op_names; diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc index e8be50b71917c6..b749a1f282c5b3 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc @@ -1805,7 +1805,7 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, auto* bv_tensor = scope->FindVar(eltadd2_b->Name())->GetMutable(); - // NOTE(minghaoBD): to make it compatible with strucutured pruning on + // NOTE(minghaoBD): to make it compatible with structured pruning on // num_head dimension: // 1. get dim_head from reshape.shape[3], dim_embed from // layer_norm_bias.shape[0] @@ -1952,7 +1952,7 @@ int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, auto ffn1_in_scale = PADDLE_GET_CONST( float, ffn_matmul_1_op->GetAttr("Input_scale_" + ffn1_input_name)); - // Calc outscale and Set them + // Calc out scale and Set them auto qkv_weight_scale = PADDLE_GET_CONST(float, matmul0_op->GetAttr("weight_scale")); auto out_weight_scale = @@ -2629,7 +2629,7 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( auto* qkv_b_tensor = scope->FindVar(eltadd0_b->Name())->GetMutable(); - // NOTE(minghaoBD): to make it compatible with strucutured pruning on + // NOTE(minghaoBD): to make it compatible with structured pruning on // num_head dimension: // 1. get dim_head from reshape.shape[3], dim_embed from // layer_norm_bias.shape[0] @@ -2758,9 +2758,9 @@ int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( auto ffn1_in_scale = PADDLE_GET_CONST( float, ffn_matmul_1_op->GetAttr("Input_scale_" + ffn1_input_name)); - // Calc outscale and Set them + // Calc out scale and Set them // TODO(wufeisheng): Currently just match layer-wise weight scale, where - // channel-wise weight scale should also be surpported. + // channel-wise weight scale should also be supported. auto qkv_weight_scale = PADDLE_GET_CONST(float, matmul0_op->GetAttr("weight_scale")); auto out_weight_scale = @@ -4267,7 +4267,7 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( auto* qkv_b_tensor = scope->FindVar(eltadd0_b->Name())->GetMutable(); - // NOTE(minghaoBD): to make it compatible with strucutured pruning on + // NOTE(minghaoBD): to make it compatible with structured pruning on // num_head dimension: // 1. 
get dim_head from reshape.shape[3], dim_embed from // layer_norm_bias.shape[0] @@ -4407,7 +4407,7 @@ int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( auto ffn1_in_scale = PADDLE_GET_CONST( float, ffn_matmul_1_op->GetAttr("Input_scale_" + ffn1_input_name)); - // Calc outscale and Set them + // Calc out scale and Set them auto qkv_weight_scale = PADDLE_GET_CONST(float, matmul0_op->GetAttr("weight_scale")); auto out_weight_scale = diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 25f120c7866b50..ebf273a8d1c2ea 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -452,7 +452,7 @@ PDNode* MultiHeadMatmulPattern::operator()() { } PDNode* MultiHeadMatmulV3Pattern::operator()() { - // Add mul op to support huggingface onnx model convertsion by x2paddle + // Add mul op to support huggingface onnx model conversion by x2paddle std::unordered_set matmul_ops{"mul", "matmul", "matmul_v2"}; auto* input0 = pattern->NewNode(input0_repr()); input0->assert_is_ops_input(matmul_ops); diff --git a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.h index 4d081e7c3ac780..f43206d3c74cd6 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.h +++ b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.h @@ -24,7 +24,7 @@ namespace framework { namespace ir { namespace patterns { /* - * \brief Fuse the subgraph representing multihead attention part of roformer + * \brief Fuse the subgraph representing multi-head attention part of roformer * into multihead_matmul_roformer op. * * \note The following graph represents this equation: diff --git a/paddle/fluid/framework/ir/xpu/fast_where_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fast_where_xpu_fuse_pass.cc index c8c36c7134dade..2723105fa0c466 100644 --- a/paddle/fluid/framework/ir/xpu/fast_where_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fast_where_xpu_fuse_pass.cc @@ -438,7 +438,7 @@ CascadeFastWhereXPUPattern::CascadeFastWhereXPUPattern( pattern->NewNode(fast_where_xpu0_repr())->assert_is_op("fast_where_xpu"); auto fast_where_xpu1 = pattern->NewNode(fast_where_xpu1_repr())->assert_is_op("fast_where_xpu"); - // declare vairable nodes + // declare variable nodes auto condition0 = pattern->NewNode(condition0_repr()) ->assert_is_op_input("fast_where_xpu", "condition"); auto condition1 = pattern->NewNode(condition1_repr()) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 90f5b93dcb2efa..5dae6c1c845148 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -320,7 +320,7 @@ void NaiveExecutor::ResetTrtOps(int num) { #endif } -void NaiveExecutor::CloneLiteEnigne(int num, void *stream) { +void NaiveExecutor::CloneLiteEngine(int num, void *stream) { #ifdef PADDLE_WITH_LITE for (auto &op : ops_) { if (op->Type() == "lite_engine") { diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index 8388bfe3a37fc1..1f56805a870209 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -90,7 +90,7 @@ class NaiveExecutor { void ResetTrtOps(int num); - void CloneLiteEnigne(int num, void* stream); + void CloneLiteEngine(int num, void* stream); void RegisterOutputHook(const HookFunc& hookfunc); void 
RegisterInputHook(const HookFunc& hookfunc); diff --git a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc index 0a713b89727f61..99829de387c321 100644 --- a/paddle/fluid/framework/new_executor/feed_fetch_utils.cc +++ b/paddle/fluid/framework/new_executor/feed_fetch_utils.cc @@ -77,7 +77,7 @@ void SplitFeedTensors(const std::vector& feed_names, 0, phi::errors::InvalidArgument( "Split expects feed data (%s)'s dim[0] (%d) is " - "diviable by micro_batch_num (%d).", + "divisible by micro_batch_num (%d).", feed_names[i], numel_size, micro_batch_num)); @@ -211,7 +211,7 @@ void MergeTensors(const std::vector& tensors, tensor_dims[j], new_dim[j], phi::errors::InvalidArgument( - "DenseTensor.ddim[%d] should eaqual to %d, but is %d", + "DenseTensor.ddim[%d] should equal to %d, but is %d", j, new_dim[j], tensor_dims[j])); diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index d7ad210102b94b..523842438d3555 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -251,7 +251,7 @@ void IfInstruction::Run() { false_branch_inter_->Run({}, false); CopyBranchOutput(false_branch_outputs_, false_branch_inter_); } - // copy ouptut + // copy output } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc index ceee6c7d91739e..720fb521439fc5 100644 --- a/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc @@ -89,7 +89,7 @@ void CustomKernelInstruction::BuildCustomContext( input_ptrs_.emplace_back(nullptr); custom_kernel_ctx_.EmplaceBackInput(std::move(paddle::Tensor())); } - VLOG(8) << "ctx->EmplaceBackInput : an optioanl input " << t; + VLOG(8) << "ctx->EmplaceBackInput : an optional input " << t; continue; } auto in_var_name = value_exec_info_.GetVarName(ptr); @@ -285,7 +285,7 @@ void CustomKernelInstruction::BuildCustomContext( cache_out_ptrs_.emplace_back(nullptr); custom_kernel_ctx_.EmplaceBackOutput(std::move(paddle::Tensor())); - VLOG(8) << "ctx->EmplaceBackOutput : an optioanl output"; + VLOG(8) << "ctx->EmplaceBackOutput : an optional output"; continue; } diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index a788d7d898b1b7..237e8baa95daff 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -32,7 +32,7 @@ PADDLE_DEFINE_EXPORTED_bool( // The difference between "sequential_run" and "serial_run": // "sequential_run" dispatches OPs one by one according to the sequence in the -// Program, while "serial_run" ensures that all Ops are scheduled in a singal +// Program, while "serial_run" ensures that all Ops are scheduled in a signal // thread. In standalone executor, "sequential_run" is also "serial_run", while // "serial_run" is not necessarily "sequential_run". 
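To make the distinction concrete, a toy scheduler (made-up names, not Paddle's executor) that honors the two modes might look like this: "sequential_run" pins dispatch to program order, while "serial_run" only promises a single scheduling thread and may still follow a dependency-driven order.

#include <cstddef>
#include <functional>
#include <vector>

struct ToyOp {
  std::function<void()> run;
};

void RunOnce(const std::vector<ToyOp>& ops_in_program_order,
             const std::vector<size_t>& dependency_driven_order,
             bool sequential_run) {
  if (sequential_run) {
    // Dispatch strictly in the order the ops appear in the Program.
    for (const auto& op : ops_in_program_order) op.run();
  } else {
    // Still one thread ("serial"), but the scheduler chooses the order.
    for (size_t idx : dependency_driven_order) ops_in_program_order[idx].run();
  }
}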
PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run, @@ -206,13 +206,13 @@ void DependencyBuilder::AddDependencyForCoalesceTensorOp() { } // find first op read 'outputs' between (first_read_fused_out_op, end) - // add depned: first_read_fused_out_op -> first op that reads 'outputs' + // add depend: first_read_fused_out_op -> first op that reads 'outputs' // special case for consecutive communication ops, for example, // FusedOutput = c_sync_calc_stream(FusedOutput) // FusedOutput= c_allreduce_sum(FusedOutput) // FusedOutput = c_sync_comm_stream(FusedOutput) - // we should take the last one to add depned instead of + // we should take the last one to add depend instead of // 'first_read_fused_out_op' size_t target = first_read_fused_out_op; for (size_t j = first_read_fused_out_op + 1; j < op_num_; ++j) { @@ -355,8 +355,8 @@ void DependencyBuilder::AddDownstreamOp(size_t prior_op_idx, std::set& downstream_ops = (*op_downstream_map_)[prior_op_idx]; // NOTE(Ruibiao): Here the downstream map shrinking is best-effort, therefore // ShrinkDownstreamMap after BuildDownstreamMap is still helpful. For example, - // a->c will not be shrinked in the following case: AddDownstreamOp(a, b) -> - // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c), it should be shrinked by + // a->c will not be shrunk in the following case: AddDownstreamOp(a, b) -> + // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c), it should be shrunk by // ShrinkDownstreamMap. for (size_t op_idx : downstream_ops) { if (OpHappensBefore(op_idx, posterior_op_idx)) { @@ -531,7 +531,7 @@ void DependencyBuilder::ShrinkDownstreamMap() { } } // NOTE(Ruibiao): op_happens_before will not be changed when shrink - // dowstream map + // downstream map (*op_downstream_map_)[i] = minumum_nexts; } VLOG(8) << "Finish shrink downstream map"; @@ -963,7 +963,7 @@ void DependencyBuilderSimplify::ShrinkDownstreamMap() { } } // NOTE(Ruibiao): op_happens_before will not be changed when shrink - // dowstream map + // downstream map op_downstream_map_.at(i) = minumum_nexts; } VLOG(8) << "Finish shrink downstream map"; @@ -1031,13 +1031,13 @@ void DependencyBuilderSimplify::AddDependencyForCoalesceTensorOp() { } // find first op read 'outputs' between (first_read_fused_out_op, end) - // add depned: first_read_fused_out_op -> first op that reads 'outputs' + // add depend: first_read_fused_out_op -> first op that reads 'outputs' // special case for consecutive communication ops, for example, // FusedOutput = c_sync_calc_stream(FusedOutput) // FusedOutput= c_allreduce_sum(FusedOutput) // FusedOutput = c_sync_comm_stream(FusedOutput) - // we should take the last one to add depned instead of + // we should take the last one to add depend instead of // 'first_read_fused_out_op' size_t target = first_read_fused_out_op; for (size_t j = first_read_fused_out_op + 1; j < op_num_; ++j) { @@ -1236,8 +1236,8 @@ void DependencyBuilderSimplify::SetSameStream() { } } -// get_new_exector_order by dfs -std::vector DependencyBuilderSimplify::get_new_exexutor_order() { +// get_new_executor_order by dfs +std::vector DependencyBuilderSimplify::get_new_executor_order() { PADDLE_ENFORCE_EQ( is_build_, true, @@ -1288,17 +1288,17 @@ std::vector DependencyBuilderSimplify::get_new_exexutor_order() { is_visit[op_idx] = true; } - std::vector dependecy_count(op_num_, 0); + std::vector dependency_count(op_num_, 0); for (auto it : op_downstream_map_) { for (auto op_idx : it.second) { - dependecy_count[op_idx]++; + dependency_count[op_idx]++; } } std::stack s; std::priority_queue> pq; for (size_t 
op_idx = op_num_ - 1; op_idx >= start_index_; op_idx--) { - if (dependecy_count[op_idx] == 0) { + if (dependency_count[op_idx] == 0) { pq.push(std::make_pair(op_behind_num[op_idx], op_idx)); } } @@ -1318,7 +1318,7 @@ std::vector DependencyBuilderSimplify::get_new_exexutor_order() { for (auto it = op_downstream_map_[current].rbegin(); it != op_downstream_map_[current].rend(); it++) { - if (--dependecy_count[*it] == 0 && !not_usefull_op.count(current)) { + if (--dependency_count[*it] == 0 && !not_usefull_op.count(current)) { pq.push(std::make_pair(op_behind_num[*it], *it)); // s.push(*it); } @@ -1383,8 +1383,8 @@ void DependencyBuilderSimplify::AddDownstreamOp(size_t prior_op_idx, std::set& downstream_ops = op_downstream_map_[prior_op_idx]; // NOTE(Ruibiao): Here the downstream map shrinking is best-effort, therefore // ShrinkDownstreamMap after BuildDownstreamMap is still helpful. For example, - // a->c will not be shrinked in the following case: AddDownstreamOp(a, b) -> - // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c), it should be shrinked by + // a->c will not be shrunk in the following case: AddDownstreamOp(a, b) -> + // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c), it should be shrunk by // ShrinkDownstreamMap. for (size_t op_idx : downstream_ops) { if (OpHappensBefore(op_idx, posterior_op_idx)) { diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index 5670a8ea043476..bec13301ae996f 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -27,7 +27,7 @@ class InstructionBase; namespace interpreter { // DependencyBuilder provides some dependency adding function to handle the -// dependency that cannot be explicitly expresed by a Program. It is a +// dependency that cannot be explicitly expressed by a Program. It is a // compromise of the incomplete expression ability of the Program. Do not add // too many functions here at will, that will bring great burden to the // Interpretercore. 
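The shrinking mentioned in the comments above can be summarized with a rough, self-contained sketch (assumed types, not the real DependencyBuilder): a direct edge to `item` is dropped when another downstream op already happens before `item`, so only the minimal set of "next" ops is kept and op_happens_before itself is left untouched.

#include <cstddef>
#include <set>
#include <vector>

std::set<size_t> ShrinkNexts(const std::set<size_t>& nexts,
                             const std::vector<std::vector<bool>>& happens_before) {
  std::set<size_t> minimum_nexts;
  for (size_t item : nexts) {
    bool implied_by_other_next = false;
    for (size_t other : nexts) {
      if (other != item && happens_before[other][item]) {
        implied_by_other_next = true;  // other -> ... -> item already orders them
        break;
      }
    }
    if (!implied_by_other_next) minimum_nexts.insert(item);
  }
  return minimum_nexts;
}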
@@ -146,7 +146,7 @@ class DependencyBuilderSimplify { phi::errors::Unavailable("op_happen_before is not yet built")); return op_happens_before_.at(prior_op_idx).at(posterior_op_idx); } - std::vector get_new_exexutor_order(); + std::vector get_new_executor_order(); private: void AddDependencyForCoalesceTensorOp(); diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index a336e2c377dfd1..9ad65274846a6b 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -315,7 +315,7 @@ void Instruction::AddInplace(Variable* in, Variable* out) { void Instruction::ClearInplace() { vec_inplace_in_to_out_.clear(); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -void Instruction::UpdataRecordStreamForGcInfo() { +void Instruction::UpdateRecordStreamForGcInfo() { if (!IsInterpretercoreFastGCEnabled() || KernelType() != OpFuncType::kGpuAsync) { return; diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 66773746deb274..ad74b5fc60746d 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -309,7 +309,7 @@ class Instruction { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool need_record_stream_for_gc_ = false; gpuStream_t stream_{nullptr}; - void UpdataRecordStreamForGcInfo(); + void UpdateRecordStreamForGcInfo(); #endif bool can_use_infermeta_ctx_ = false; diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 932f3c32830842..fcb190a7999223 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -107,7 +107,7 @@ PirInterpreter::PirInterpreter(const platform::Place& place, exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); - dependecy_count_ = std::make_shared>(); + dependency_count_ = std::make_shared>(); if (!FLAGS_new_executor_use_local_scope) { execution_config_.create_local_scope = false; @@ -170,7 +170,7 @@ PirInterpreter::PirInterpreter( exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); - dependecy_count_ = std::make_shared>(); + dependency_count_ = std::make_shared>(); if (!FLAGS_new_executor_use_local_scope) { execution_config_.create_local_scope = false; @@ -207,7 +207,7 @@ PirInterpreter::PirInterpreter( } PirInterpreter::~PirInterpreter() { - // cancle gc's thread + // cancel gc's thread gc_.reset(nullptr); async_work_queue_.reset(); VLOG(4) << "~PirInterpreter(): " << this << " on " << place_; @@ -289,7 +289,7 @@ void PirInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) { } // share op dependency ir_dependency_builder_.ShareDependencyFrom(impl.GetPirDependencyBuilder()); - dependecy_count_ = impl.GetDependencyCount(); + dependency_count_ = impl.GetDependencyCount(); // share event analysis ir_stream_analyzer_.ShareEventInfoFrom(impl.GetPirStreamAnalyzer()); is_shared_results_build_ = true; @@ -313,7 +313,7 @@ PirInterpreter::GetPirDependencyBuilder() const { std::shared_ptr> PirInterpreter::GetDependencyCount() const { - return dependecy_count_; + return dependency_count_; } const 
interpreter::PirStreamAnalyzer& PirInterpreter::GetPirStreamAnalyzer() @@ -629,8 +629,8 @@ void PirInterpreter::AnalyseExecuteOrderForTrace( if (VLOG_IS_ON(2)) { ss << "\nLeaf nodes: "; } - for (size_t instr_id = 0; instr_id < dependecy_count_->size(); ++instr_id) { - if ((*dependecy_count_)[instr_id] == 0) { + for (size_t instr_id = 0; instr_id < dependency_count_->size(); ++instr_id) { + if ((*dependency_count_)[instr_id] == 0) { ready_ops.push(instr_id); if (VLOG_IS_ON(2)) { ss << instr_id << "[" << vec_instruction_base_[instr_id]->Name() @@ -663,9 +663,9 @@ void PirInterpreter::AnalyseExecuteOrderForTrace( PADDLE_ENFORCE_EQ( trace_order.size(), - dependecy_count_->size(), + dependency_count_->size(), platform::errors::PreconditionNotMet( - "trace_order size should be equal to dependecy_count_.")); + "trace_order size should be equal to dependency_count_.")); trace_execute_order_ = trace_order; @@ -893,11 +893,11 @@ std::vector PirInterpreter::DebugInfo() { void PirInterpreter::BuildInstructionDependences() { // analysis the dependences between instructions, add next_instr_list to each - // instr, and set the dependecy_count_ + // instr, and set the dependency_count_ size_t instr_num = vec_instruction_base_.size(); - dependecy_count_ = GetDependencyCount(); + dependency_count_ = GetDependencyCount(); if (!is_shared_results_build_) { - dependecy_count_->assign(instr_num, 0); + dependency_count_->assign(instr_num, 0); } std::vector instructions_ptr; for (auto& instr : vec_instruction_base_) { @@ -940,7 +940,7 @@ void PirInterpreter::BuildInstructionDependences() { if (!is_shared_results_build_) { for (size_t next_instr_id : next_instr_ids) { - ++(*dependecy_count_)[next_instr_id]; + ++(*dependency_count_)[next_instr_id]; } } } @@ -1013,7 +1013,7 @@ void PirInterpreter::RecordStreamForGC(InstructionBase* instr) { memory::RecordStream(allocation, stream); } else if (platform::is_cuda_pinned_place(place)) { // TODO(Ruibiao): Here should do something to make sure that the tensor - // is not freed until the H2D copies done. However, simplely launch a + // is not freed until the H2D copies done. However, simply launch a // CUDA runtime callback to the H2D stream may lead a high performance // overhead. As all the cases we meet in H2D are copies from CPUPlace at // present, we just log a WARNING here. A better design is required. @@ -1037,7 +1037,7 @@ void PirInterpreter::RecordStreamForGC(InstructionBase* instr) { * async CUDA kernel. * * Here we only process the first condition, because: - * 1. Since the RecordStream function will directly return when the recored + * 1. Since the RecordStream function will directly return when the recorded * stream is equal to the owning stream, recording a stream same as which * initialized this tensor has less time overhead. 
Conversely, it may take * more time if we try to extract those cross-stream input vars from @@ -1235,7 +1235,7 @@ void PirInterpreter::CalculateLastLiveOps() { } VLOG(4) << "shrink the last_live_ops list for all vars in skip_gc_vars"; - for (auto& dep : *dependecy_count_) { + for (auto& dep : *dependency_count_) { deps_.emplace_back(std::make_shared(dep)); } for (size_t i = 0; i < value_exe_info_->GetVarList().size(); ++i) { @@ -1246,8 +1246,8 @@ void PirInterpreter::CalculateLastLiveOps() { } void PirInterpreter::ConstructEventForJitInput() { - for (size_t i = 0; i < dependecy_count_->size(); ++i) { - if ((*dependecy_count_)[i] == 0) { + for (size_t i = 0; i < dependency_count_->size(); ++i) { + if ((*dependency_count_)[i] == 0) { InstructionBase* inst = vec_instruction_base_[i].get(); if (inst->Name() == "pd_op.memcpy_d2h" && platform::is_gpu_place(place_)) { @@ -1514,8 +1514,8 @@ void PirInterpreter::TraceRunInstructionList( } } - for (size_t i = 0; i < dependecy_count_->size(); ++i) { - if ((*dependecy_count_)[i] == 0) { + for (size_t i = 0; i < dependency_count_->size(); ++i) { + if ((*dependency_count_)[i] == 0) { // NOTE(zhiqiu): hot fix for jit input var RecordMemcpyD2H(vec_instr.at(i).get()); } @@ -1571,8 +1571,8 @@ void PirInterpreter::MultiThreadRunInstructionList( } } - for (size_t i = 0; i < dependecy_count_->size(); ++i) { - if ((*dependecy_count_)[i] == 0) { + for (size_t i = 0; i < dependency_count_->size(); ++i) { + if ((*dependency_count_)[i] == 0) { // NOTE(zhiqiu): hot fix for jit input var RecordMemcpyD2H(vec_instr.at(i).get()); if (FLAGS_new_executor_serial_run) { @@ -1794,7 +1794,7 @@ void PirInterpreter::PreAnalysis() { BuildInstructionDependences(); VLOG(4) << "Done BuildInstructionDependences"; - ir_stream_analyzer_.SetForceEventsToWaitInfo(force_evnets_to_wait_); + ir_stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_); ir_stream_analyzer_.ConstructEvents(vec_instruction_base_); VLOG(4) << "Done ConstructEvents"; diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index be959ca723163e..ee1bd662820c99 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -111,13 +111,13 @@ class PirInterpreter : public InterpreterBaseImpl { std::unordered_map>* GetForceEventsToWaitInfo() { - return force_evnets_to_wait_; + return force_events_to_wait_; } void SetForceEventsToWaitInfo( std::unordered_map>* - force_evnets_to_wait) { - force_evnets_to_wait_ = force_evnets_to_wait; + force_events_to_wait) { + force_events_to_wait_ = force_events_to_wait; } private: @@ -168,7 +168,7 @@ class PirInterpreter : public InterpreterBaseImpl { ExecutionConfig execution_config_; std::unordered_map>* - force_evnets_to_wait_; + force_events_to_wait_; VariableScope var_scope_; Scope* scope_{nullptr}; @@ -187,9 +187,9 @@ class PirInterpreter : public InterpreterBaseImpl { // var std::map> last_live_ops_; - // (*dependecy_count_)[i] contains the number of dependencies that the i-th op - // need to wait - std::shared_ptr> dependecy_count_; + // (*dependency_count_)[i] contains the number of dependencies that the i-th + // op need to wait + std::shared_ptr> dependency_count_; std::vector> deps_; std::vector> refs_; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 0f50665e1621e8..e6e3060cda94a5 100644 --- 
a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -63,7 +63,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); - dependecy_count_ = std::make_shared>(); + dependency_count_ = std::make_shared>(); if (!FLAGS_new_executor_use_local_scope) { execution_config_.create_local_scope = false; @@ -100,7 +100,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, } ProgramInterpreter::~ProgramInterpreter() { - // cancle gc's thread + // cancel gc's thread gc_.reset(nullptr); async_work_queue_.reset(); VLOG(4) << "~ProgramInterpreter(): " << this << " on " << place_; @@ -355,7 +355,7 @@ void ProgramInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) { } // share op dependency dependency_builder_.ShareDependencyFrom(impl.GetDependencyBuilder()); - dependecy_count_ = impl.GetDependencyCount(); + dependency_count_ = impl.GetDependencyCount(); // share event analysis stream_analyzer_.ShareEventInfoFrom(impl.GetStreamAnalyzer()); is_shared_results_build_ = true; @@ -399,7 +399,7 @@ const interpreter::DependencyBuilder& ProgramInterpreter::GetDependencyBuilder() std::shared_ptr> ProgramInterpreter::GetDependencyCount() const { - return dependecy_count_; + return dependency_count_; } const interpreter::StreamAnalyzer& ProgramInterpreter::GetStreamAnalyzer() @@ -452,7 +452,7 @@ void ProgramInterpreter::BuildAndCacheInstructionCtx(Instruction* instr_node) { void ProgramInterpreter::BuildInplace() { // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput phi::DenseTensor // and a list of Output Tensors which are sliced from the FusedOutput. These - // outputs sholud not be the outvar of the in-place var-pair since memory + // outputs should not be the outvar of the in-place var-pair since memory // reuse between FusedOutput and Output Tensors is assumed. For the following // example: // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3) @@ -603,11 +603,11 @@ void ProgramInterpreter::CheckCUDAGraphBeforeRun( void ProgramInterpreter::BuildOperatorDependences() { // analysis the dependences between ops, add next_instr_list to each instr, - // and set the dependecy_count_ + // and set the dependency_count_ size_t instr_num = vec_instruction_.size(); - dependecy_count_ = GetDependencyCount(); + dependency_count_ = GetDependencyCount(); if (!is_shared_results_build_) { - dependecy_count_->assign(instr_num, 0); + dependency_count_->assign(instr_num, 0); } auto downstream_map = dependency_builder_.Build(vec_instruction_); @@ -647,7 +647,7 @@ void ProgramInterpreter::BuildOperatorDependences() { if (!is_shared_results_build_) { for (size_t next_instr_id : next_instr_ids) { - ++(*dependecy_count_)[next_instr_id]; + ++(*dependency_count_)[next_instr_id]; } } } @@ -692,8 +692,8 @@ void ProgramInterpreter::Convert( // add event for the input var of jit program, since there are async copied // from gpu_pinned place to gpu place on compute stream. 
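The dependency_count_ renamed throughout these interpreter hunks is the per-instruction wait counter. A minimal sketch with toy types (not the real interpreter) of how such counters drive dispatch: instructions whose counter starts at zero are the "leaf" instructions the loops below look for, and each completed instruction decrements its downstream counters and releases the ones that reach zero.

#include <atomic>
#include <cstddef>
#include <deque>
#include <vector>

struct ToyInstr {
  std::vector<size_t> next_instr_ids;  // downstream instruction ids
};

std::vector<size_t> DispatchByDependencyCount(
    const std::vector<ToyInstr>& instrs,
    std::vector<std::atomic<size_t>>& dependency_count) {
  std::deque<size_t> ready;
  for (size_t i = 0; i < instrs.size(); ++i) {
    if (dependency_count[i].load() == 0) ready.push_back(i);  // leaf instructions
  }
  std::vector<size_t> executed_order;
  while (!ready.empty()) {
    size_t cur = ready.front();
    ready.pop_front();
    executed_order.push_back(cur);  // "run" instruction cur here
    for (size_t next : instrs[cur].next_instr_ids) {
      // fetch_sub returns the previous value; 1 means the last dependency is done
      if (dependency_count[next].fetch_sub(1) == 1) ready.push_back(next);
    }
  }
  return executed_order;
}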
- for (size_t i = 0; i < dependecy_count_->size(); ++i) { - if ((*dependecy_count_)[i] == 0) { + for (size_t i = 0; i < dependency_count_->size(); ++i) { + if ((*dependency_count_)[i] == 0) { auto& inst = vec_instruction_[i]; if (inst.OpBase()->Type() == interpreter::kMemcpyD2H && platform::is_gpu_place(place_)) { @@ -840,7 +840,7 @@ void ProgramInterpreter::Convert( BuildInplace(); } - for (auto& dep : *dependecy_count_) { + for (auto& dep : *dependency_count_) { deps_.emplace_back(std::make_shared(dep)); } for (size_t i = 0; i < vec_meta_info.size(); ++i) { @@ -860,7 +860,7 @@ void ProgramInterpreter::BuildOpFuncNode( vec_instruction_.reserve(op_nums); for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { auto& op_func_node = nodes[op_idx]; - stream_analyzer_.SetForceEventsToWaitInfo(force_evnets_to_wait_); + stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_); auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); #ifdef PADDLE_WITH_CUDA if (FLAGS_new_executor_use_cuda_graph) { @@ -887,7 +887,7 @@ void ProgramInterpreter::BuildOpFuncNode( vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - vec_instruction_.back().UpdataRecordStreamForGcInfo(); + vec_instruction_.back().UpdateRecordStreamForGcInfo(); #endif } } @@ -1245,8 +1245,8 @@ void ProgramInterpreter::ExecuteInstructionList( } } - for (size_t i = 0; i < dependecy_count_->size(); ++i) { - if ((*dependecy_count_)[i] == 0) { + for (size_t i = 0; i < dependency_count_->size(); ++i) { + if ((*dependency_count_)[i] == 0) { // NOTE(zhiqiu): hot fix for jit input var RecordMemcpyD2H(vec_instr.at(i)); if (FLAGS_new_executor_serial_run) { @@ -1396,7 +1396,7 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { memory::RecordStream(allocation, stream); } else if (platform::is_cuda_pinned_place(place)) { // TODO(Ruibiao): Here should do something to make sure that the tensor - // is not freed until the H2D copies done. However, simplely launch a + // is not freed until the H2D copies done. However, simply launch a // CUDA runtime callback to the H2D stream may lead a high performance // overhead. As all the cases we meet in H2D are copies from CPUPlace at // present, we just log a WARNING here. A better design is required. @@ -1420,7 +1420,7 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { * async CUDA kernel. * * Here we only process the first condition, because: - * 1. Since the RecordStream function will directly return when the recored + * 1. Since the RecordStream function will directly return when the recorded * stream is equal to the owning stream, recording a stream same as which * initialized this tensor has less time overhead. 
Conversely, it may take * more time if we try to extract those cross-stream input vars from @@ -1609,8 +1609,8 @@ void ProgramInterpreter::TraceInstructionList( exception_holder_.Clear(); - for (size_t i = 0; i < dependecy_count_->size(); ++i) { - if ((*dependecy_count_)[i] == 0) { + for (size_t i = 0; i < dependency_count_->size(); ++i) { + if ((*dependency_count_)[i] == 0) { // NOTE(zhiqiu): hot fix for jit input var RecordMemcpyD2H(vec_instr.at(i)); } @@ -1687,8 +1687,8 @@ void ProgramInterpreter::AnalyseExecuteOrderForTrace() { std::vector trace_order; SchedulingQueue ready_ops(instruction_scheduling_priority_less); - for (size_t instr_id = 0; instr_id < dependecy_count_->size(); ++instr_id) { - if ((*dependecy_count_)[instr_id] == 0) { + for (size_t instr_id = 0; instr_id < dependency_count_->size(); ++instr_id) { + if ((*dependency_count_)[instr_id] == 0) { ready_ops.push(instr_id); } } @@ -1709,9 +1709,9 @@ void ProgramInterpreter::AnalyseExecuteOrderForTrace() { PADDLE_ENFORCE_EQ( trace_order.size(), - dependecy_count_->size(), + dependency_count_->size(), platform::errors::PreconditionNotMet( - "trace_order size should be equal to dependecy_count_.")); + "trace_order size should be equal to dependency_count_.")); trace_execute_order_ = trace_order; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 5359c41fddcdc6..7e956249e22a38 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -103,13 +103,13 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::unordered_map>* GetForceEventsToWaitInfo() { - return force_evnets_to_wait_; + return force_events_to_wait_; } void SetForceEventsToWaitInfo( std::unordered_map>* - force_evnets_to_wait) { - force_evnets_to_wait_ = force_evnets_to_wait; + force_events_to_wait) { + force_events_to_wait_ = force_events_to_wait; } bool IsStaticBuild() const override { return static_build_; } @@ -205,7 +205,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { ExecutionConfig execution_config_; std::unordered_map>* - force_evnets_to_wait_; + force_events_to_wait_; VariableScope var_scope_; Scope* local_scope_{nullptr}; // not owned @@ -223,9 +223,9 @@ class ProgramInterpreter : public InterpreterBaseImpl { // var std::map> last_live_ops_; - // (*dependecy_count_)[i] contains the number of dependencies that the i-th op - // need to wait - std::shared_ptr> dependecy_count_; + // (*dependency_count_)[i] contains the number of dependencies that the i-th + // op need to wait + std::shared_ptr> dependency_count_; std::vector> deps_; std::vector> refs_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 491fda2e9d59a3..6bc29d918d124e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -59,7 +59,7 @@ PADDLE_DEFINE_EXPORTED_string( pe_profile_fname, "", "Profiler filename for PE, which generated by gperftools." - "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); + "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); namespace paddle { namespace framework { @@ -124,7 +124,7 @@ class ParallelExecutorPrivate { * NOTE(zengjinle): the fed variables of users should not be reused, * because users may feed them into another network. 
Changing the fed * variables that users can visit may cause calculation wrong, which is - * a very subtle bug when traning networks. However, these variables + * a very subtle bug when training networks. However, these variables * can be garbage collected. * * ParallelExecutor provides 2 methods to feed variables: diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 7daab881bea464..4cc03b95abc525 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -185,7 +185,7 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { << dest_table; copy_dense_tables_.push_back(std::make_pair(src_table, dest_table)); } - for (auto& m : copy_table_config_.table_denpendency_map()) { + for (auto& m : copy_table_config_.table_dependency_map()) { if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) { // currently only support one dependency for (auto& value : m.values()) { diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index e3abbd210f25a4..218ec423b6baf9 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -172,7 +172,7 @@ message CopyTableConfig { optional bool sparse_copy_by_feasign = 10 [ default = true ]; // table dependency for pull/push optional bool enable_dependency = 11 [ default = false ]; - repeated TableDependencyMap table_denpendency_map = 12; + repeated TableDependencyMap table_dependency_map = 12; } message CondTableMap { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 28bb03e52c8e74..ba3577694c55ac 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2875,10 +2875,10 @@ std::unique_ptr AnalysisPredictor::Clone(void *stream) { #endif #ifdef PADDLE_WITH_LITE #ifdef LITE_SUBGRAPH_WITH_XPU - x->executor_->CloneLiteEnigne(++AnalysisPredictor::clone_num_, + x->executor_->CloneLiteEngine(++AnalysisPredictor::clone_num_, config_.xpu_config_.stream); #else - x->executor_->CloneLiteEnigne(++AnalysisPredictor::clone_num_, nullptr); + x->executor_->CloneLiteEngine(++AnalysisPredictor::clone_num_, nullptr); #endif #endif return std::unique_ptr(x); diff --git a/paddle/fluid/jit/property.h b/paddle/fluid/jit/property.h index 4a3fae89f6a9dc..6b9889818251cd 100644 --- a/paddle/fluid/jit/property.h +++ b/paddle/fluid/jit/property.h @@ -109,7 +109,7 @@ class Property { // Note: the id_ is unique for all Property (only for auto parallel). uint64_t id_ = GenerateId(); - // Note: the orignal_id_ is used for referring to the original Property + // Note: the original_id_ is used for referring to the original Property // that the current Property is built from (only for auto parallel). // The default original_id_ is same as the id_, which means the // current Property is not built from the other one. 
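The proto field rename above has to reach every consumer, including the Python descriptor below, because the generated accessor follows the field name. A hedged C++ sketch of the usage pattern the workers share (BuildTableDependency is a made-up helper; the loop mirrors the worker code in this patch):

#include <cstdint>
#include <map>
#include <vector>

#include "paddle/fluid/framework/trainer_desc.pb.h"  // generated from trainer_desc.proto

std::map<uint64_t, std::vector<uint64_t>> BuildTableDependency(
    const paddle::framework::CopyTableConfig& config) {
  std::map<uint64_t, std::vector<uint64_t>> table_dependency;
  for (const auto& m : config.table_dependency_map()) {
    for (const auto& value : m.values()) {
      table_dependency[m.key()].push_back(value);  // table m.key() depends on value
    }
  }
  return table_dependency;
}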
diff --git a/python/paddle/base/trainer_desc.py b/python/paddle/base/trainer_desc.py index 65436105d05930..69a5ef4b4433d9 100644 --- a/python/paddle/base/trainer_desc.py +++ b/python/paddle/base/trainer_desc.py @@ -286,7 +286,7 @@ def _set_copy_table_config(self, config_dict): dependency_map = config_dict.get("dependency_map", {}) for key in dependency_map: - m = config.table_denpendency_map.add() + m = config.table_dependency_map.add() m.key = key values = dependency_map[key] if not isinstance(values, list): From 39177b2f6279f03fb1b2765368515ae59fb12129 Mon Sep 17 00:00:00 2001 From: co63oc Date: Sun, 28 Jan 2024 12:41:49 +0800 Subject: [PATCH 2/2] ci