Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions paddle/fluid/framework/tensor_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
<< dst_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;

#ifdef PADDLE_WITH_MKLDNN
auto size = src.layout() == DataLayout::kMKLDNN
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/memory/memcpy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
platform::CPUPlace,
const void* src, size_t num) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
std::memcpy(dst, src, num);
}

Expand Down
121 changes: 89 additions & 32 deletions paddle/fluid/operators/coalesce_tensor_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {

auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
bool use_align = context.Attr<bool>("use_align");
auto align_size = context.Attr<int>("align_size");

if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
Expand All @@ -95,7 +96,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
context.Attr<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
context.GetPlace(), use_align);
context.GetPlace(), use_align, align_size);

// Alloc the continuous space
auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
Expand All @@ -113,11 +114,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);

offset +=
use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
offset += use_align
? platform::Alignment(len * size_of_dtype,
context.GetPlace(), align_size) /
size_of_dtype
: len;
}
} else if (context.Attr<bool>("set_constant")) {
// TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION.
Expand All @@ -134,40 +135,36 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);
}
offset +=
use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
offset += use_align
? platform::Alignment(len * size_of_dtype,
context.GetPlace(), align_size) /
size_of_dtype
: len;
}
}

// Make the outputs point to the continuous space.
offset = 0;
std::stringstream ss;
ss << "alloc_space_for_vars: ";
#if defined(PADDLE_WITH_ASCEND_CL)
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
platform::NPUMemsetAsync(
static_cast<void *>(fused_tensor->mutable_data<T>(dev_ctx.GetPlace())),
0.0, fused_tensor->numel() * sizeof(T), stream);
#endif

for (size_t i = 0; i < out_tensors.size(); ++i) {
size_t len = static_cast<size_t>(out_tensors[i]->numel());
auto dim = out_tensors[i]->dims();
VLOG(4) << len << " " << dim << " " << offset;
out_tensors[i]
->ShareDataWith(fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
.Resize(dim);
len = use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
? platform::Alignment(len * size_of_dtype, context.GetPlace(),
align_size) /
size_of_dtype
: len;
offset += len;
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
<< " address: " << out_tensors[i]->data<void>() << ", ";
<< " address: " << out_tensors[i]->data<void>() << " len: " << len
<< ", ";
offset += len;
}
PADDLE_ENFORCE_EQ(
(int64_t)offset, fused_tensor->numel(),
Expand All @@ -183,7 +180,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
const size_t &size_of_dtype, const platform::Place &place,
const bool use_align = true) const {
const bool use_align = true, const int align_size = -1) const {
PADDLE_ENFORCE_EQ(
lod_tensors.size(), var_names.size(),
platform::errors::InvalidArgument(
Expand All @@ -203,15 +200,18 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
size, 0,
platform::errors::InvalidArgument(
"The number of tensor `%s`'s elements is 0.", var_names[i]));
auto len =
use_align
? platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
place, align_size) /
size_of_dtype
: static_cast<size_t>(size);
VLOG(4) << size << " " << len;
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", ";

*numel += use_align
? platform::Alignment(
static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype
: static_cast<size_t>(size);
<< " addres:" << lod_tensors[i]->data<void>() << " len: " << len
<< ", ";
*numel += len;
}
VLOG(10) << ss.str();
}
Expand All @@ -221,7 +221,42 @@ class CoalesceTensorOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;

void InferShape(framework::InferShapeContext *ctx) const override {}
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->IsRuntime()) {
return;
}
auto use_align = ctx->Attrs().Get<bool>("use_align");
auto align_size = ctx->Attrs().Get<int>("align_size");

auto dtype = static_cast<framework::proto::VarType::Type>(
ctx->Attrs().Get<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);

auto alignment = [](size_t size, size_t align_size) {
size_t remaining = size % align_size;
auto aligned_size =
remaining == 0 ? size : size + (align_size - remaining);
VLOG(4) << remaining << " " << size << " " << align_size << " "
<< aligned_size;
return aligned_size;
};
VLOG(4) << "align_size: " << align_size;
if (use_align && align_size > 0) {
int64_t numel = 0;
auto dims = ctx->GetInputsDim("Input");
for (const auto &dim : dims) {
auto size = framework::product(dim);
auto len = use_align
? alignment(static_cast<size_t>(size) * size_of_dtype,
align_size) /
size_of_dtype
: static_cast<size_t>(size);
numel += len;
}
ctx->SetOutputDim("FusedOutput", framework::make_ddim({numel}));
VLOG(4) << "FusedOutput size:" << framework::make_ddim({numel});
}
}

protected:
framework::OpKernelType GetKernelTypeForVar(
Expand Down Expand Up @@ -271,6 +306,8 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
"Whether to consider memory chunk and take alignment into "
"account for inputs and outputs.")
.SetDefault(true);
AddAttr<int>("align_size", "The alignment size when use_align is True")
.SetDefault(-1);
AddComment(R"DOC(
CoalesceTensor Operator.

Expand Down Expand Up @@ -314,6 +351,16 @@ REGISTER_OP_CUDA_KERNEL(
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
// NOTE(review): these kernels execute with NPUDeviceContext and must be
// registered for the NPU place. The original used REGISTER_OP_CUDA_KERNEL,
// which registers them for the CUDA place and, when PADDLE_WITH_CUDA (or
// HIP) and PADDLE_WITH_ASCEND_CL are both defined, duplicates the CUDA
// registration of `coalesce_tensor` above.
REGISTER_OP_NPU_KERNEL(
    coalesce_tensor,
    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
                                plat::float16>,
    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
#endif

#ifdef PADDLE_WITH_XPU
REGISTER_OP_XPU_KERNEL(
coalesce_tensor,
Expand Down Expand Up @@ -343,4 +390,14 @@ REGISTER_OP_VERSION(coalesce_tensor)
"In order to optionally take memory alignment into account when "
"coalescing tensors. The default value is true to be compatible "
"with before.",
true));
true))
.AddCheckpoint(
R"ROC(
Upgrade coalesce_tensor: add a new attribute [align_size].)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"align_size",
"In order to optionally take memory alignment into account when "
"coalescing tensors. The default value is -1 and use the default "
"align_size "
"of each place to be compatible with before.",
-1));
8 changes: 5 additions & 3 deletions paddle/fluid/operators/sum_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@ class SumNPUKernel : public framework::OpKernel<T> {
auto place = ctx.GetPlace();

int n = static_cast<int>(x.size());
PADDLE_ENFORCE_EQ(n > 1, true,
platform::errors::InvalidArgument(
"The size of Input(x) list must larger or equal 2"));

if (n == 1) {
TensorCopy(*x[0], place, out);
return;
}

auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
Expand Down
24 changes: 14 additions & 10 deletions paddle/fluid/platform/device_memory_aligment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,26 @@ limitations under the License. */

namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place) {
size_t alignment = 1024;
if (platform::is_cpu_place(place)) {
alignment = CpuMinChunkSize();
size_t Alignment(size_t size, const platform::Place &place, int align_size) {
size_t alignment = 0;
if (align_size > 0) {
alignment = align_size;
} else {
alignment = 1024;
if (platform::is_cpu_place(place)) {
alignment = CpuMinChunkSize();
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
alignment = GpuMinChunkSize();
alignment = GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU)
// TODO(wangxi): add XpuMinChunkSize
alignment = alignment;
alignment = alignment;
#elif defined(PADDLE_WITH_ASCEND_CL)
alignment = NPUMinChunkSize();
alignment = NPUMinChunkSize();
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA or NPU."));
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA/XPU/NPU."));
#endif
}
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
Expand Down
6 changes: 5 additions & 1 deletion paddle/fluid/platform/device_memory_aligment.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,13 @@ limitations under the License. */
#elif defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/npu_info.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/npu_info.h"
#endif

namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place);
size_t Alignment(size_t size, const platform::Place &place,
int align_size = -1);
} // namespace platform
} // namespace paddle
Loading