PaddlePaddle · gongweibao · Mar 13, 2024 · Mar 8, 2024 · Mar 10, 2024 · Mar 10, 2024
diff --git a/paddle/fluid/memory/allocation/memory_block.cc b/paddle/fluid/memory/allocation/memory_block.cc
@@ -43,7 +43,9 @@ MemoryBlock* MemoryBlock::GetRightBuddy(MetadataCache* cache) {
   return cache->LoadDesc(this)->right_buddy;
 }
 
-void MemoryBlock::Split(MetadataCache* cache, size_t size) {
+void MemoryBlock::Split(MetadataCache* cache,
+                        size_t size,
+                        size_t extra_padding_size) {
   auto desc = cache->LoadDesc(this);
   // make sure the split fits
   PADDLE_ENFORCE_GE(desc->total_size,
@@ -54,8 +56,10 @@ void MemoryBlock::Split(MetadataCache* cache, size_t size) {
                         desc->total_size,
                         size));
 
+  size_t pay_load_size = sizeof(MemoryBlock::Desc) + extra_padding_size;
+
   // bail out if there is no room for another partition
-  if (desc->total_size - size <= sizeof(MemoryBlock::Desc)) {
+  if (desc->total_size - size <= pay_load_size) {
     return;
   }
 
@@ -71,13 +75,13 @@ void MemoryBlock::Split(MetadataCache* cache, size_t size) {
   cache->Save(static_cast<MemoryBlock*>(right_partition),
               MemoryBlock::Desc(FREE_CHUNK,
                                 desc->index,
-                                remaining_size - sizeof(MemoryBlock::Desc),
+                                remaining_size - pay_load_size,
                                 remaining_size,
                                 this,
                                 new_block_right_buddy));
 
   desc->right_buddy = static_cast<MemoryBlock*>(right_partition);
-  desc->size = size - sizeof(MemoryBlock::Desc);
+  desc->size = size - pay_load_size;
   desc->total_size = size;
 
   desc->UpdateGuards();

diff --git a/paddle/fluid/memory/allocation/memory_block.h b/paddle/fluid/memory/allocation/memory_block.h
@@ -50,7 +50,7 @@ struct MemoryBlock {
   MemoryBlock* GetRightBuddy(MetadataCache* cache);
 
   // Split the allocation into left/right blocks.
-  void Split(MetadataCache* cache, size_t size);
+  void Split(MetadataCache* cache, size_t size, size_t extra_padding_size = 0);
 
   // Merge left and right blocks together.
   void Merge(MetadataCache* cache, MemoryBlock* right_buddy);

diff --git a/test/custom_runtime/process_group_xccl.py b/test/custom_runtime/process_group_xccl.py
@@ -68,7 +68,7 @@ def test_create_process_group_xccl(self):
             task.wait()
             # assert np.array_equal(tensor_y, sum_result)
 
-        print("test allreduce sum api ok")
+        print("test allreduce sum api ok", flush=True)
 
         x = np.random.random(self.shape).astype(self.dtype)
         tensor_x = paddle.to_tensor(x)
@@ -86,7 +86,7 @@ def test_create_process_group_xccl(self):
             task.wait()
             # assert np.array_equal(tensor_y, max_result)
 
-        print("test allreduce max api ok")
+        print("test allreduce max api ok", flush=True)
 
         # test broadcast
         # rank 0
@@ -110,7 +110,7 @@ def test_create_process_group_xccl(self):
             assert task.is_completed()
             # assert np.array_equal(broadcast_result, tensor_y)
 
-        print("test broadcast api ok")
+        print("test broadcast api ok", flush=True)
 
         # test barrier
         # rank 0
@@ -122,7 +122,7 @@ def test_create_process_group_xccl(self):
             task = pg.barrier(device_id)
             task.wait()
 
-        print("test barrier api ok\n")
+        print("test barrier api ok\n", flush=True)
         return
 
         # test allgather
@@ -150,7 +150,7 @@ def test_create_process_group_xccl(self):
         )
         # assert np.array_equal(tensor_x, out_1)
         # assert np.array_equal(tensor_y, out_2)
-        print("test allgather api ok\n")
+        print("test allgather api ok\n", flush=True)
 
         # test alltoall
         # rank 0
@@ -183,7 +183,7 @@ def test_create_process_group_xccl(self):
         #     assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
         # else:
         #     assert np.array_equal(out2_1, raw_tensor_x_2)
-        print("test alltoall api ok\n")
+        print("test alltoall api ok\n", flush=True)
 
         # test Reduce
         # rank 0
@@ -203,7 +203,7 @@ def test_create_process_group_xccl(self):
             # paddle.base.core._custom_device_synchronize("custom_cpu", -1)
         # if pg.rank() == 0:
         #     assert np.array_equal(tensor_x, sum_result)
-        print("test reduce sum api ok\n")
+        print("test reduce sum api ok\n", flush=True)
 
         # test Scatter
         # rank 0
@@ -228,7 +228,7 @@ def test_create_process_group_xccl(self):
         #     assert np.array_equal(tensor_y, out1)
         # else:
         #     assert np.array_equal(tensor_y, out2)
-        print("test scatter api ok\n")
+        print("test scatter api ok\n", flush=True)
 
 
 if __name__ == "__main__":