@@ -17,8 +17,10 @@ limitations under the License. */
 #endif
 #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
 
+#include "cub/cub.cuh"
 #include "paddle/fluid/framework/data_feed.h"
-
+#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h"
+#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h"
 namespace paddle {
 namespace framework {
 
@@ -144,6 +146,89 @@ void SlotRecordInMemoryDataFeed::CopyForTensor(
   cudaStreamSynchronize(stream);
 }
 
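+// Writes one (src_key, neighbor) id pair per sampled edge into id_tensor:
+// prefix_sum[idx] is the starting pair offset of key idx, so sampled
+// neighbor k of key idx lands at pair index prefix_sum[idx] + k.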
+__global__ void GraphFillIdKernel(int64_t *id_tensor, int *actual_sample_size,
+                                  int64_t *prefix_sum, int64_t *device_key,
+                                  int64_t *neighbors, int sample_size,
+                                  int len) {
+  CUDA_KERNEL_LOOP(idx, len) {
+    for (int k = 0; k < actual_sample_size[idx]; k++) {
+      int offset = (prefix_sum[idx] + k) * 2;
+      id_tensor[offset] = device_key[idx];
+      id_tensor[offset + 1] = neighbors[idx * sample_size + k];
+    }
+  }
+}
+
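+// Fills a tensor with the constant 1; used for the show/clk feeds below.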
+__global__ void GraphFillCVMKernel(int64_t *tensor, int len) {
+  CUDA_KERNEL_LOOP(idx, len) { tensor[idx] = 1; }
+}
+
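+// Turns one neighbor-sample result into a batch: an inclusive prefix sum
+// over actual_sample_size yields the per-key pair offsets, then the feed
+// tensors (id/show/clk) are sized and filled on stream_.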
+void GraphDataGenerator::FeedGraphIns(size_t cursor, int len,
+                                      NeighborSampleResult &sample_res) {
+  size_t temp_storage_bytes = 0;
+  int *d_actual_sample_size = sample_res.actual_sample_size;
+  int64_t *d_neighbors = sample_res.val;
+  int64_t *d_prefix_sum = reinterpret_cast<int64_t *>(d_prefix_sum_->ptr());
+  // GraphFillIdKernel reads prefix_sum[0] as the offset of the first key,
+  // and the scan below only writes elements 1..len, so zero it explicitly
+  // in case the buffer is not pre-zeroed at allocation.
+  cudaMemsetAsync(d_prefix_sum, 0, sizeof(int64_t), stream_);
+  // cub's two-phase pattern: the first call with a NULL workspace only
+  // reports the required temp_storage_bytes, the second performs the scan.
+  CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, temp_storage_bytes,
+                                           d_actual_sample_size,
+                                           d_prefix_sum + 1, len, stream_));
+  auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes);
+
+  CUDA_CHECK(cub::DeviceScan::InclusiveSum(
+      d_temp_storage->ptr(), temp_storage_bytes, d_actual_sample_size,
+      d_prefix_sum + 1, len, stream_));
+  int64_t total_ins = 0;
+  cudaMemcpyAsync(&total_ins, d_prefix_sum + len, sizeof(int64_t),
+                  cudaMemcpyDeviceToHost, stream_);
+  // Synchronize after the copy: total_ins is consumed on the host below.
+  cudaStreamSynchronize(stream_);
+
+  // Each sampled edge contributes a (src, dst) pair of ids.
+  total_ins *= 2;
+  id_tensor_ptr_ =
+      feed_vec_[0]->mutable_data<int64_t>({total_ins, 1}, this->place_);
+  show_tensor_ptr_ =
+      feed_vec_[1]->mutable_data<int64_t>({total_ins}, this->place_);
+  clk_tensor_ptr_ =
+      feed_vec_[2]->mutable_data<int64_t>({total_ins}, this->place_);
+
+  GraphFillIdKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
+      id_tensor_ptr_, d_actual_sample_size, d_prefix_sum,
+      device_keys_ + cursor, d_neighbors, walk_degree_, len);
+  // CUDA_KERNEL_LOOP is grid-stride, so a grid sized for len threads still
+  // covers all total_ins elements of the show/clk tensors.
+  GraphFillCVMKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
+      show_tensor_ptr_, total_ins);
+  GraphFillCVMKernel<<<GET_BLOCKS(len), CUDA_NUM_THREADS, 0, stream_>>>(
+      clk_tensor_ptr_, total_ins);
+
+  // A single-sequence LoD spanning all total_ins instances.
+  offset_.clear();
+  offset_.push_back(0);
+  offset_.push_back(total_ins);
+  LoD lod{offset_};
+  feed_vec_[0]->set_lod(lod);
+  // feed_vec_[1]->set_lod(lod);
+  // feed_vec_[2]->set_lod(lod);
+  cudaStreamSynchronize(stream_);
+}
+
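+// Consumes the next slice of up to sample_key_size_ device keys, samples
+// walk_degree_ neighbors for each, and feeds the result as one batch.
+// Returns 1 while keys remain, 0 once device_key_size_ keys are consumed.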
+int GraphDataGenerator::GenerateBatch() {
+  // GpuPsGraphTable *g = (GpuPsGraphTable *)(gpu_graph_ptr->graph_table);
+  platform::CUDADeviceGuard guard(gpuid_);
+  auto gpu_graph_ptr = GraphGpuWrapper::GetInstance();
+  int tmp_len = cursor_ + sample_key_size_ > device_key_size_
+                    ? device_key_size_ - cursor_
+                    : sample_key_size_;
+  VLOG(3) << "device key size: " << device_key_size_
+          << " this batch: " << tmp_len << " cursor: " << cursor_
+          << " sample_key_size_: " << sample_key_size_;
+  if (tmp_len == 0) {
+    return 0;
+  }
+  auto sample_res = gpu_graph_ptr->graph_neighbor_sample(
+      gpuid_, device_keys_ + cursor_, walk_degree_, tmp_len);
+  FeedGraphIns(cursor_, tmp_len, sample_res);
+  cursor_ += tmp_len;
+  return 1;
+}
+
 }  // namespace framework
 }  // namespace paddle
 #endif