Skip to content

Commit 47b82ac

Browse files
authored
Merge pull request #6 from Thunderbrook/gpugraph_deepwalk
[GpuGraph] remove useless variables and adjust log level
2 parents fc28b23 + 308a394 commit 47b82ac

File tree

5 files changed

+25
-29
lines changed

5 files changed

+25
-29
lines changed

paddle/fluid/framework/data_feed.cc

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,6 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place& place,
5252
device_key_size_ = h_device_keys_->size();
5353
d_device_keys_ =
5454
memory::AllocShared(place_, device_key_size_ * sizeof(int64_t));
55-
for (size_t i = 0; i < h_device_keys_->size(); i++) {
56-
VLOG(2) << "h_device_keys_[" << i << "] = " << (*h_device_keys_)[i];
57-
}
5855
CUDA_CHECK(cudaMemcpyAsync(d_device_keys_->ptr(), h_device_keys_->data(),
5956
device_key_size_ * sizeof(int64_t),
6057
cudaMemcpyHostToDevice, stream_));
@@ -67,7 +64,6 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place& place,
6764
cursor_ = 0;
6865
jump_rows_ = 0;
6966
device_keys_ = reinterpret_cast<int64_t*>(d_device_keys_->ptr());
70-
VLOG(2) << "device_keys_ = " << (uint64_t)device_keys_;
7167
d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(int64_t));
7268
cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(int64_t), stream_);
7369
d_sample_keys_ =

paddle/fluid/framework/data_feed.cu

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,6 @@ __global__ void GraphDoWalkKernel(int64_t *neighbors, int64_t *walk,
256256
size_t col = step;
257257
size_t offset = (row * col_size + col);
258258
walk[offset] = neighbors[i * cur_degree + k];
259-
id_cnt[row] += 1;
260259
}
261260
}
262261
}
@@ -366,7 +365,7 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk) {
366365
h_sample_keys = new int64_t[once_max_sample_keynum];
367366
h_offset2idx = new int[once_max_sample_keynum];
368367
h_len_per_row = new int[once_max_sample_keynum];
369-
h_prefix_sum = new int64_t[100];
368+
h_prefix_sum = new int64_t[once_max_sample_keynum + 1];
370369
}
371370
///////
372371
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
@@ -378,7 +377,9 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk) {
378377
stream_);
379378
int i = 0;
380379
int total_row = 0;
381-
while (i < buf_size_) {
380+
int remain_size =
381+
buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_;
382+
while (i <= remain_size) {
382383
int tmp_len = cursor_ + once_sample_startid_len_ > device_key_size_
383384
? device_key_size_ - cursor_
384385
: once_sample_startid_len_;
@@ -389,7 +390,6 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk) {
389390
<< " tmp_len = " << tmp_len << " cursor = " << cursor_
390391
<< " once_max_sample_keynum = " << once_max_sample_keynum;
391392
int64_t *cur_walk = walk + i;
392-
len_per_row += once_max_sample_keynum;
393393

394394
if (debug_mode_) {
395395
cudaMemcpy(h_walk, walk, buf_size_ * sizeof(int64_t),
@@ -408,14 +408,9 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk) {
408408
if (debug_mode_) {
409409
cudaMemcpy(h_walk, walk, buf_size_ * sizeof(int64_t),
410410
cudaMemcpyDeviceToHost);
411-
cudaMemcpy(h_len_per_row, len_per_row,
412-
once_max_sample_keynum * sizeof(int), cudaMemcpyDeviceToHost);
413411
for (int xx = 0; xx < buf_size_; xx++) {
414412
VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx];
415413
}
416-
for (int xx = 0; xx < once_max_sample_keynum; xx++) {
417-
VLOG(2) << "h_len_per_row[" << xx << "]: " << h_len_per_row[xx];
418-
}
419414
}
420415
/////////
421416
step++;
@@ -433,12 +428,6 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk) {
433428
for (int xx = 0; xx < buf_size_; xx++) {
434429
VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx];
435430
}
436-
cudaMemcpy(h_len_per_row, len_per_row,
437-
once_max_sample_keynum * sizeof(int),
438-
cudaMemcpyDeviceToHost);
439-
for (int xx = 0; xx < once_max_sample_keynum; xx++) {
440-
VLOG(2) << "h_len_per_row[" << xx << "]: " << h_len_per_row[xx];
441-
}
442431
}
443432
}
444433
cursor_ += tmp_len;
@@ -458,6 +447,13 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr<phi::Allocation> d_walk) {
458447
shuffle_seed_ = engine();
459448

460449
if (debug_mode_) {
450+
int *h_random_row = new int[total_row + 10];
451+
cudaMemcpy(h_random_row, d_random_row, total_row * sizeof(int),
452+
cudaMemcpyDeviceToHost);
453+
for (int xx = 0; xx < total_row; xx++) {
454+
VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx];
455+
}
456+
delete h_random_row;
461457
delete[] h_walk;
462458
delete[] h_sample_keys;
463459
delete[] h_offset2idx;

paddle/fluid/framework/data_feed.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -870,8 +870,11 @@ struct BufState {
870870
}
871871

872872
int GetNextBatch() {
873+
cursor += len;
873874
int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size;
874-
cursor += tmp_len;
875+
if (tmp_len == 0) {
876+
return 0;
877+
}
875878
len = tmp_len;
876879
central_word = -1;
877880
step = -1;

paddle/fluid/framework/hogwild_worker.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -179,8 +179,6 @@ void HogwildWorker::TrainFilesWithProfiler() {
179179
PrintFetchVars();
180180
#ifdef PADDLE_WITH_HETERPS
181181
dev_ctx_->Wait();
182-
VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time
183-
<< " seconds, ins_num: " << total_inst;
184182
for (size_t i = 0; i < op_name.size(); ++i) {
185183
VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i]
186184
<< ", mean time: " << op_total_time[i] / total_inst
@@ -202,6 +200,9 @@ void HogwildWorker::TrainFilesWithProfiler() {
202200
thread_scope_->DropKids();
203201
timeline.Start();
204202
}
203+
VLOG(0) << "GpuPs worker " << thread_id_ << " train cost " << total_time
204+
<< " seconds, ins_num: " << total_inst << " read time: " << read_time
205+
<< "seconds ";
205206

206207
if (need_dump_field_ || need_dump_param_) {
207208
writer_.Flush();
@@ -256,7 +257,7 @@ void HogwildWorker::TrainFiles() {
256257
thread_scope_->DropKids();
257258
}
258259
timeline.Pause();
259-
VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
260+
VLOG(0) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec()
260261
<< " seconds, ins_num: " << total_ins_num;
261262

262263
if (need_dump_field_ || need_dump_param_) {

python/paddle/fluid/dataset.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,11 +1041,9 @@ def _set_heter_ps(self, enable_heter_ps=False):
10411041
user no need to call this function.
10421042
"""
10431043
self.dataset.set_heter_ps(enable_heter_ps)
1044-
1044+
10451045
def set_graph_device_keys(self, device_keys):
10461046
"""
1047-
Set heter ps mode
1048-
user no need to call this function.
10491047
"""
10501048
self.dataset.set_graph_device_keys(device_keys)
10511049

@@ -1054,11 +1052,13 @@ def set_graph_config(self, config):
10541052
"""
10551053
self.proto_desc.graph_config.walk_degree = config.get("walk_degree", 1)
10561054
self.proto_desc.graph_config.walk_len = config.get("walk_len", 20)
1057-
self.proto_desc.graph_config.once_sample_startid_len = config.get("once_sample_startid_len", 8000)
1058-
self.proto_desc.graph_config.sample_times_one_chunk = config.get("sample_times_one_chunk", 10)
1055+
self.proto_desc.graph_config.window = config.get("window", 5)
1056+
self.proto_desc.graph_config.once_sample_startid_len = config.get(
1057+
"once_sample_startid_len", 8000)
1058+
self.proto_desc.graph_config.sample_times_one_chunk = config.get(
1059+
"sample_times_one_chunk", 10)
10591060
self.proto_desc.graph_config.batch_size = config.get("batch_size", 1)
10601061
self.proto_desc.graph_config.debug_mode = config.get("debug_mode", 0)
1061-
10621062

10631063

10641064
class QueueDataset(DatasetBase):

0 commit comments

Comments (0)