Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions paddle/fluid/framework/tensor_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
<< dst_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

#ifdef PADDLE_WITH_MKLDNN
auto size = src.layout() == DataLayout::kMKLDNN
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/memory/memcpy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
platform::CPUPlace,
const void* src, size_t num) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
std::memcpy(dst, src, num);
}

Expand Down
121 changes: 89 additions & 32 deletions paddle/fluid/operators/coalesce_tensor_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {

auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
bool use_align = context.Attr<bool>("use_align");
auto align_size = context.Attr<int>("align_size");

if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
Expand All @@ -95,7 +96,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
context.Attr<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
context.GetPlace(), use_align);
context.GetPlace(), use_align, align_size);

// Alloc the continuous space
auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
Expand All @@ -113,11 +114,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);

offset +=
use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
offset += use_align
? platform::Alignment(len * size_of_dtype,
context.GetPlace(), align_size) /
size_of_dtype
: len;
}
} else if (context.Attr<bool>("set_constant")) {
// TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION.
Expand All @@ -134,40 +135,36 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);
}
offset +=
use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
offset += use_align
? platform::Alignment(len * size_of_dtype,
context.GetPlace(), align_size) /
size_of_dtype
: len;
}
}

// Make the outputs point to the continuous space.
offset = 0;
std::stringstream ss;
ss << "alloc_space_for_vars: ";
#if defined(PADDLE_WITH_ASCEND_CL)
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
platform::NPUMemsetAsync(
static_cast<void *>(fused_tensor->mutable_data<T>(dev_ctx.GetPlace())),
0.0, fused_tensor->numel() * sizeof(T), stream);
#endif

for (size_t i = 0; i < out_tensors.size(); ++i) {
size_t len = static_cast<size_t>(out_tensors[i]->numel());
auto dim = out_tensors[i]->dims();
VLOG(4) << len << " " << dim << " " << offset;
out_tensors[i]
->ShareDataWith(fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
.Resize(dim);
len = use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
? platform::Alignment(len * size_of_dtype, context.GetPlace(),
align_size) /
size_of_dtype
: len;
offset += len;
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
<< " address: " << out_tensors[i]->data<void>() << ", ";
<< " address: " << out_tensors[i]->data<void>() << " len: " << len
<< ", ";
offset += len;
}
PADDLE_ENFORCE_EQ(
(int64_t)offset, fused_tensor->numel(),
Expand All @@ -183,7 +180,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
const size_t &size_of_dtype, const platform::Place &place,
const bool use_align = true) const {
const bool use_align = true, const int align_size = -1) const {
PADDLE_ENFORCE_EQ(
lod_tensors.size(), var_names.size(),
platform::errors::InvalidArgument(
Expand All @@ -203,15 +200,18 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
size, 0,
platform::errors::InvalidArgument(
"The number of tensor `%s`'s elements is 0.", var_names[i]));
auto len =
use_align
? platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
place, align_size) /
size_of_dtype
: static_cast<size_t>(size);
VLOG(4) << size << " " << len;
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", ";

*numel += use_align
? platform::Alignment(
static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype
: static_cast<size_t>(size);
<< " addres:" << lod_tensors[i]->data<void>() << " len: " << len
<< ", ";
*numel += len;
}
VLOG(10) << ss.str();
}
Expand All @@ -221,7 +221,42 @@ class CoalesceTensorOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;

void InferShape(framework::InferShapeContext *ctx) const override {}
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->IsRuntime()) {
return;
}
auto use_align = ctx->Attrs().Get<bool>("use_align");
auto align_size = ctx->Attrs().Get<int>("align_size");

auto dtype = static_cast<framework::proto::VarType::Type>(
ctx->Attrs().Get<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);

auto alignment = [](size_t size, size_t align_size) {
size_t remaining = size % align_size;
auto aligned_size =
remaining == 0 ? size : size + (align_size - remaining);
VLOG(4) << remaining << " " << size << " " << align_size << " "
<< aligned_size;
return aligned_size;
};
VLOG(4) << "align_size: " << align_size;
if (use_align && align_size > 0) {
int64_t numel = 0;
auto dims = ctx->GetInputsDim("Input");
for (const auto &dim : dims) {
auto size = framework::product(dim);
auto len = use_align
? alignment(static_cast<size_t>(size) * size_of_dtype,
align_size) /
size_of_dtype
: static_cast<size_t>(size);
numel += len;
}
ctx->SetOutputDim("FusedOutput", framework::make_ddim({numel}));
VLOG(4) << "FusedOutput size:" << framework::make_ddim({numel});
}
}

protected:
framework::OpKernelType GetKernelTypeForVar(
Expand Down Expand Up @@ -271,6 +306,8 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
"Whether to consider memory chunk and take alignment into "
"account for inputs and outputs.")
.SetDefault(true);
AddAttr<int>("align_size", "The alignment size when use_align is True")
.SetDefault(-1);
AddComment(R"DOC(
CoalesceTensor Operator.

Expand Down Expand Up @@ -314,6 +351,16 @@ REGISTER_OP_CUDA_KERNEL(
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
// NOTE(review): these kernels execute with NPUDeviceContext and must be
// registered for the NPU place. The original used REGISTER_OP_CUDA_KERNEL,
// which registers them for the CUDA place and, when PADDLE_WITH_CUDA (or
// HIP) and PADDLE_WITH_ASCEND_CL are both defined, duplicates the CUDA
// registration of `coalesce_tensor` above.
REGISTER_OP_NPU_KERNEL(
    coalesce_tensor,
    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
                                plat::float16>,
    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
#endif

#ifdef PADDLE_WITH_XPU
REGISTER_OP_XPU_KERNEL(
coalesce_tensor,
Expand Down Expand Up @@ -343,4 +390,14 @@ REGISTER_OP_VERSION(coalesce_tensor)
"In order to optionally take memory alignment into account when "
"coalescing tensors. The default value is true to be compatible "
"with before.",
true));
true))
.AddCheckpoint(
R"ROC(
Upgrade coalesce_tensor: add a new attribute [align_size].)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"align_size",
"In order to optionally take memory alignment into account when "
"coalescing tensors. The default value is -1 and use the default "
"align_size "
"of each place to be compatible with before.",
-1));
8 changes: 5 additions & 3 deletions paddle/fluid/operators/sum_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@ class SumNPUKernel : public framework::OpKernel<T> {
auto place = ctx.GetPlace();

int n = static_cast<int>(x.size());
PADDLE_ENFORCE_EQ(n > 1, true,
platform::errors::InvalidArgument(
"The size of Input(x) list must larger or equal 2"));

if (n == 1) {
TensorCopy(*x[0], place, out);
return;
}

auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
Expand Down
24 changes: 14 additions & 10 deletions paddle/fluid/platform/device_memory_aligment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,26 @@ limitations under the License. */

namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place) {
size_t alignment = 1024;
if (platform::is_cpu_place(place)) {
alignment = CpuMinChunkSize();
size_t Alignment(size_t size, const platform::Place &place, int align_size) {
size_t alignment = 0;
if (align_size > 0) {
alignment = align_size;
} else {
alignment = 1024;
if (platform::is_cpu_place(place)) {
alignment = CpuMinChunkSize();
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
alignment = GpuMinChunkSize();
alignment = GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU)
// TODO(wangxi): add XpuMinChunkSize
alignment = alignment;
alignment = alignment;
#elif defined(PADDLE_WITH_ASCEND_CL)
alignment = NPUMinChunkSize();
alignment = NPUMinChunkSize();
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA or NPU."));
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA/XPU/NPU."));
#endif
}
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
Expand Down
6 changes: 5 additions & 1 deletion paddle/fluid/platform/device_memory_aligment.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,13 @@ limitations under the License. */
#elif defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/npu_info.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/npu_info.h"
#endif

namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place);
size_t Alignment(size_t size, const platform::Place &place,
int align_size = -1);
} // namespace platform
} // namespace paddle
Loading