
Commit 3518522

[CPU-PSLIB] Rename scale_sparse_grad to scale_sparse_gradient_with_batch_size
1 parent ef94264 commit 3518522

9 files changed (+22, -15 lines)

paddle/fluid/framework/device_worker.h

Lines changed: 1 addition & 1 deletion
@@ -212,7 +212,7 @@ class DeviceWorker {
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
-  bool scale_sparse_grad_;
+  bool scale_sparse_gradient_with_batch_size_;
   TrainerDesc trainer_desc_;

   // dump params or grads for debug

paddle/fluid/framework/downpour_worker.cc

Lines changed: 6 additions & 3 deletions
@@ -89,7 +89,8 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
   use_cvm_ = desc.use_cvm();
   // for sparse value accessor, embedding only
   no_cvm_ = desc.no_cvm();
-  scale_sparse_grad_ = desc.scale_sparse_grad();
+  scale_sparse_gradient_with_batch_size_ =
+      desc.scale_sparse_gradient_with_batch_size();
   scale_datanorm_ = desc.scale_datanorm();
   dump_slot_ = desc.dump_slot();
   adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
@@ -592,7 +593,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
           *thread_scope_, tid, features_[tid], feature_labels_[tid],
           sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
           &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-          dump_slot_, &sparse_push_keys_[tid], no_cvm_, scale_sparse_grad_);
+          dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+          scale_sparse_gradient_with_batch_size_);
       timeline.Pause();
       push_sparse_time += timeline.ElapsedSec();
       total_time += timeline.ElapsedSec();
@@ -867,7 +869,8 @@ void DownpourWorker::TrainFiles() {
           *thread_scope_, tid, features_[tid], feature_labels_[tid],
           sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
           &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-          dump_slot_, &sparse_push_keys_[tid], no_cvm_, scale_sparse_grad_);
+          dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+          scale_sparse_gradient_with_batch_size_);
     }
   }
paddle/fluid/framework/downpour_worker_opt.cc

Lines changed: 3 additions & 2 deletions
@@ -450,12 +450,13 @@ void DownpourWorkerOpt::TrainFiles() {
           break;
         }
       }
-      bool scale_sparse_grad_ = true;
+      bool scale_sparse_gradient_with_batch_size_ = true;
      fleet_ptr_->PushSparseVarsWithLabelAsync(
          *thread_scope_, tid, features_[tid], feature_labels_[tid],
          sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
          &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_,
-         dump_slot_, &sparse_push_keys_[tid], no_cvm_, scale_sparse_grad_);
+         dump_slot_, &sparse_push_keys_[tid], no_cvm_,
+         scale_sparse_gradient_with_batch_size_);
     }
   }

paddle/fluid/framework/fleet/fleet_wrapper.cc

Lines changed: 1 addition & 1 deletion
@@ -871,7 +871,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync(
     std::vector<::std::future<int32_t>>* push_sparse_status,
     const int batch_size, const bool use_cvm, const bool dump_slot,
     std::vector<uint64_t>* sparse_push_keys, const bool no_cvm,
-    const bool scale_sparse_grad) {
+    const bool scale_sparse_gradient_with_batch_size) {
 #ifdef PADDLE_WITH_PSLIB
   int offset = 2;
   int slot_offset = 0;

paddle/fluid/framework/fleet/fleet_wrapper.h

Lines changed: 1 addition & 1 deletion
@@ -210,7 +210,7 @@ class FleetWrapper {
       std::vector<::std::future<int32_t>>* push_sparse_status,
       const int batch_size, const bool use_cvm, const bool dump_slot,
       std::vector<uint64_t>* sparse_push_keys, const bool no_cvm,
-      const bool scale_sparse_grad);
+      const bool scale_sparse_gradient_with_batch_size);

   // Push sparse variables to server in async mode
   void PushSparseFromTensorWithLabelAsync(

paddle/fluid/framework/trainer_desc.proto

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ message TrainerDesc {

   optional bool use_ps_gpu = 32 [ default = false ];
   optional string user_define_dump_filename = 33;
-  optional bool scale_sparse_grad = 34 [ default = true ];
+  optional bool scale_sparse_gradient_with_batch_size = 34 [ default = true ];

   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
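Note: since the field keeps its number (34) and type, the rename is wire-compatible for already-serialized TrainerDesc messages; only name-based access (protobuf text format, Python attribute access) must switch to the new name.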

python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py

Lines changed: 2 additions & 1 deletion
@@ -825,7 +825,8 @@ def _minimize(self,
         opt_info["worker_skipped_ops"] = worker_skipped_ops
         opt_info["use_cvm"] = strategy.get("use_cvm", False)
         opt_info["no_cvm"] = strategy.get("no_cvm", False)
-        opt_info["scale_sparse_grad"] = strategy.get("scale_sparse_grad", True)
+        opt_info["scale_sparse_gradient_with_batch_size"] = strategy.get(
+            "scale_sparse_gradient_with_batch_size", True)
         opt_info["worker_class"] = strategy.get("worker_class",
                                                 "DownpourWorker")
         opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
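For callers, the renamed key is read from the same strategy dict as before. A minimal sketch (the other entries are illustrative, mirroring the nearby opt_info lookups; they are not part of this commit):

# Illustrative PSLib strategy dict; only the renamed key comes from
# this commit's change.
strategy = {
    "use_cvm": True,
    "no_cvm": False,
    # was "scale_sparse_grad"; the default remains True
    "scale_sparse_gradient_with_batch_size": True,
}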

python/paddle/fluid/trainer_desc.py

Lines changed: 3 additions & 2 deletions
@@ -124,8 +124,9 @@ def _set_use_cvm(self, use_cvm=False):
     def _set_no_cvm(self, no_cvm=False):
         self.proto_desc.no_cvm = no_cvm

-    def _set_scale_sparse_grad(self, scale_sparse_grad=True):
-        self.proto_desc.scale_sparse_grad = scale_sparse_grad
+    def _set_scale_sparse_grad_with_batch_size(
+            self, scale_sparse_gradient_with_batch_size=True):
+        self.proto_desc.scale_sparse_gradient_with_batch_size = scale_sparse_gradient_with_batch_size

     def _set_scale_datanorm(self, scale_datanorm=-1):
         self.proto_desc.scale_datanorm = scale_datanorm

python/paddle/fluid/trainer_factory.py

Lines changed: 4 additions & 3 deletions
@@ -95,9 +95,10 @@ def _create_trainer(self, opt_info=None):
                 trainer._set_use_cvm(opt_info["use_cvm"])
             if opt_info.get("no_cvm") is not None:
                 trainer._set_no_cvm(opt_info["no_cvm"])
-            if opt_info.get("scale_sparse_grad") is not None:
-                trainer._set_scale_sparse_grad(opt_info[
-                    "scale_sparse_grad"])
+            if opt_info.get(
+                    "scale_sparse_gradient_with_batch_size") is not None:
+                trainer._set_scale_sparse_grad_with_batch_size(opt_info[
+                    "scale_sparse_gradient_with_batch_size"])
             if opt_info.get("scale_datanorm") is not None:
                 trainer._set_scale_datanorm(opt_info["scale_datanorm"])
             if opt_info.get("adjust_ins_weight") is not None:
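End to end, the flag now travels as scale_sparse_gradient_with_batch_size from the user strategy dict through opt_info and TrainerDesc down to FleetWrapper::PushSparseVarsWithLabelAsync. A hedged usage sketch, assuming the PSLib fleet API of this era (the optimizer and `loss` construction are assumptions, not part of this diff):

import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

# `loss` is assumed to come from the user's network definition; only
# the strategy key below reflects this commit's rename.
optimizer = fluid.optimizer.Adam(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(
    optimizer, strategy={"scale_sparse_gradient_with_batch_size": True})
optimizer.minimize(loss)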
