Skip to content

Commit beef7af

Browse files
authored
Add activation offloader (#74837)
* Add activation offloader
* Fix Mac compile error
* Fix Windows compile error
* Fix compile error on Windows and XPU
* Remove dist_api_gen.py modification
* Add activation offloader unit tests
* Fix unit tests (including on Windows) and improve coverage
* Add more unit tests
* Improve coverage
1 parent e8e81ce commit beef7af

File tree

22 files changed

+800
-33
lines changed

22 files changed

+800
-33
lines changed

paddle/common/flags.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2140,6 +2140,16 @@ PHI_DEFINE_EXPORTED_bool(
21402140
false,
21412141
"Enable add lock when call AutoGrowthBestFitAllocator::ReleaseImpl");
21422142

2143+
// How many times an offload attempt may be retried; the default of -1
// presumably means "unlimited" — NOTE(review): confirm against the
// activation-offloader code that consumes this flag.
PHI_DEFINE_EXPORTED_int64(offload_retry_times, -1, "Offload retry times.");

// Whether tensors modified in place are still eligible for offloading
// (enabled by default).
PHI_DEFINE_EXPORTED_bool(offload_inplace_tensor,
                         true,
                         "Whether to allow offload inplace tensor.");

// Debug switch: when enabled, offload information is printed.
PHI_DEFINE_EXPORTED_bool(print_offload_info,
                         false,
                         "Whether to print the offload information.");
2152+
21432153
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
21442154
/**
21452155
* FlashAttention related FLAG

paddle/fluid/distributed/collective/process_group_nccl.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,15 @@ ProcessGroupNCCL::~ProcessGroupNCCL() {
166166
}
167167
}
168168

169+
// Drops the record tying `tensor`'s allocation to the communication stream
// of the communicator context created for the tensor's place.  A no-op when
// the tensor has no storage, or when no communicator context exists yet for
// that place.
void ProcessGroupNCCL::EraseStream(const phi::DenseTensor& tensor) const {
  if (!tensor.initialized()) {
    return;  // Uninitialized tensor: nothing to untrack.
  }
  const auto ctx_iter =
      place_to_comm_ctx_.find(GetKeyFromPlace(tensor.place()));
  if (ctx_iter == place_to_comm_ctx_.end()) {
    return;  // No communicator has been set up on this place.
  }
  memory::EraseStream(tensor.Holder(), ctx_iter->second->stream());
}
177+
169178
void ProcessGroupNCCL::GroupStart() {
170179
NCCL_CHECK(phi::dynload::ncclGroupStart());
171180
++s_group_call_counter;

paddle/fluid/distributed/collective/process_group_nccl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream {
9292
std::shared_ptr<phi::distributed::NCCLConfig> nccl_config = nullptr);
9393
~ProcessGroupNCCL();
9494

95+
void EraseStream(const phi::DenseTensor& tensor) const override;
96+
9597
std::string GetBackendName() const override { return "NCCL"; }
9698

9799
phi::DeviceContext* GetDeviceContext(const Place& place) const override;

paddle/fluid/distributed/collective/process_group_with_stream.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ class ProcessGroupWithStream : public ProcessGroup {
6060
ProcessGroupWithStream(int rank, int size, int gid)
6161
: ProcessGroup(rank, size, gid) {}
6262

63+
// Base-class stub: backends that track per-stream allocations (e.g. the
// NCCL process group) override this to detach `tensor`'s allocation from
// their communication stream; any backend that does not override it
// reports the operation as unsupported.
// The parameter exists only for overriders, so it is marked
// [[maybe_unused]] to keep -Wunused-parameter builds clean.
// @throws phi::errors::Unimplemented always (in this base implementation).
virtual void EraseStream(
    [[maybe_unused]] const phi::DenseTensor& tensor) const {
  PADDLE_THROW(phi::errors::Unimplemented("EraseStream is not implemented."));
}
66+
6367
virtual ~ProcessGroupWithStream() = default;
6468

6569
std::shared_ptr<ProcessGroup::Task> AllGather(

paddle/fluid/eager/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@ set(eager_deps
1313
grad_tensor_holder
1414
custom_operator_node)
1515

16+
# The activation offloader is built only for CUDA (WITH_GPU) builds and is
# then appended to the eager dependency list — NOTE(review): ROCm/XPU builds
# intentionally skip it per the commit description; confirm that remains the
# intent.
if(WITH_GPU)
  cc_library(
    activation_offloader
    SRCS activation_offloader.cc
    DEPS phi_core phi_gpu)
  list(APPEND eager_deps activation_offloader)
endif()
23+
1624
if(WITH_GPU OR WITH_ROCM)
1725
set(eager_deps ${eager_deps} phi_gpu)
1826
endif()

0 commit comments

Comments
 (0)