Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 110 additions & 132 deletions paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,167 +35,149 @@ using paddle::platform::MKLDNNDeviceContext;
using platform::to_void_cast;

template <typename T>
class BatchNormMKLDNNHandler
: public platform::MKLDNNHandlerT<T, mkldnn::batch_normalization_forward,
mkldnn::batch_normalization_backward> {
class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT<
T, mkldnn::batch_normalization_forward,
mkldnn::batch_normalization_backward> {
public:
BatchNormMKLDNNHandler(const paddle::framework::ExecutionContext &ctx,
const platform::MKLDNNDeviceContext &dev_ctx,
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const Tensor *x,
const bool global_stats, const bool test_mode,
const std::string &unique_name)
: platform::MKLDNNHandlerT<T, mkldnn::batch_normalization_forward,
mkldnn::batch_normalization_backward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
unique_name)) {
if (!this->isCached()) {
const float epsilon = ctx.Attr<float>("epsilon");
const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");

std::vector<std::string> DataLayout_error_msg = {"kNHWC", "kNCHW",
"kAnyLayout", "kMKLDNN"};
PADDLE_ENFORCE_EQ(
x->layout(), DataLayout::kMKLDNN,
platform::errors::InvalidArgument(
"Wrong layout set for X tensor. Expected layout is `kMKLDNN`, "
"But received %s.",
DataLayout_error_msg[static_cast<int>(DataLayout::kMKLDNN)]));
PADDLE_ENFORCE_NE(
x->format(), MKLDNNMemoryFormat::undef,
platform::errors::InvalidArgument("Wrong format set for X tensor"));

auto src_tz = paddle::framework::vectorize(x->dims());

// Flags are added by bitwise OR operation
auto flags = mkldnn::normalization_flags::use_scale_shift; // 001
if (global_stats)
flags |= mkldnn::normalization_flags::use_global_stats; // 010
if (fuse_with_relu && test_mode)
flags |= mkldnn::normalization_flags::fuse_norm_relu; // 100

auto md = mkldnn::memory::desc(
src_tz, platform::MKLDNNGetDataType<T>(),
platform::MKLDNNFormatForSize(src_tz.size(), x->format()));

this->AcquireForwardPrimitiveDescriptor(
global_stats == true ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training,
md, epsilon, flags);
}
const mkldnn::engine mkldnn_engine, const Tensor *x,
const bool global_stats, const bool test_mode)
: platform::MKLDNNHandlerNoCachingT<T,
mkldnn::batch_normalization_forward,
mkldnn::batch_normalization_backward>(
mkldnn_engine, ctx.GetPlace()) {
const float epsilon = ctx.Attr<float>("epsilon");
const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");

std::vector<std::string> DataLayout_error_msg = {"kNHWC", "kNCHW",
"kAnyLayout", "kMKLDNN"};
PADDLE_ENFORCE_EQ(
x->layout(), DataLayout::kMKLDNN,
platform::errors::InvalidArgument(
"Wrong layout set for X tensor. Expected layout is `kMKLDNN`, "
"But received %s.",
DataLayout_error_msg[static_cast<int>(DataLayout::kMKLDNN)]));
PADDLE_ENFORCE_NE(
x->format(), MKLDNNMemoryFormat::undef,
platform::errors::InvalidArgument("Wrong format set for X tensor"));

auto src_tz = paddle::framework::vectorize(x->dims());

// Flags are added by bitwise OR operation
auto flags = mkldnn::normalization_flags::use_scale_shift; // 001
if (global_stats)
flags |= mkldnn::normalization_flags::use_global_stats; // 010
if (fuse_with_relu && test_mode)
flags |= mkldnn::normalization_flags::fuse_norm_relu; // 100

auto md = mkldnn::memory::desc(
src_tz, platform::MKLDNNGetDataType<T>(),
platform::MKLDNNFormatForSize(src_tz.size(), x->format()));

this->AcquireForwardPrimitiveDescriptor(
global_stats == true ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training,
md, epsilon, flags);
}

BatchNormMKLDNNHandler(const paddle::framework::ExecutionContext &ctx,
const platform::MKLDNNDeviceContext &dev_ctx,
platform::Place cpu_place, const Tensor *in_x,
const Tensor *scale, const Tensor *out_grad,
const std::string &unique_name)
: platform::MKLDNNHandlerT<T, mkldnn::batch_normalization_forward,
mkldnn::batch_normalization_backward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()),
unique_name)) {
if (!this->isBwdCached()) {
PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
platform::errors::InvalidArgument(
"Wrong layout set for Input out_grad tensor"));
PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef,
platform::errors::InvalidArgument(
"Wrong format set for Input out_grad tensor"));

auto src_tz = paddle::framework::vectorize<int64_t>(in_x->dims());
auto scale_tz = paddle::framework::vectorize<int64_t>(scale->dims());
PADDLE_ENFORCE_EQ(
scale_tz.size(), 1,
platform::errors::InvalidArgument(
"Dims of scale tensor must be 1, but received scale's size is %d",
scale_tz.size()));

MKLDNNMemoryFormat diff_fmt =
platform::MKLDNNFormatForSize(src_tz.size(), out_grad->format());

MKLDNNMemoryFormat src_fmt =
platform::MKLDNNFormatForSize(src_tz.size(), in_x->format());

auto dims = framework::vectorize(in_x->dims());
auto diff_dst_md = mkldnn::memory::desc(
dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
auto src_md =
mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), src_fmt);

const float epsilon = ctx.Attr<float>("epsilon");

this->AcquireForwardPrimitiveDescriptor(
mkldnn::prop_kind::forward_training, src_md, epsilon,
mkldnn::normalization_flags::use_scale_shift);
this->AcquireBackwardPrimitiveDescriptor(
mkldnn::prop_kind::backward, diff_dst_md, src_md, epsilon,
mkldnn::normalization_flags::use_scale_shift);
}
const mkldnn::engine mkldnn_engine, const Tensor *in_x,
const Tensor *scale, const Tensor *out_grad)
: platform::MKLDNNHandlerNoCachingT<T,
mkldnn::batch_normalization_forward,
mkldnn::batch_normalization_backward>(
mkldnn_engine, ctx.GetPlace()) {
PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN,
platform::errors::InvalidArgument(
"Wrong layout set for Input out_grad tensor"));
PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef,
platform::errors::InvalidArgument(
"Wrong format set for Input out_grad tensor"));

auto src_tz = paddle::framework::vectorize<int64_t>(in_x->dims());
auto scale_tz = paddle::framework::vectorize<int64_t>(scale->dims());
PADDLE_ENFORCE_EQ(
scale_tz.size(), 1,
platform::errors::InvalidArgument(
"Dims of scale tensor must be 1, but received scale's size is %d",
scale_tz.size()));

MKLDNNMemoryFormat diff_fmt =
platform::MKLDNNFormatForSize(src_tz.size(), out_grad->format());

MKLDNNMemoryFormat src_fmt =
platform::MKLDNNFormatForSize(src_tz.size(), in_x->format());

auto dims = framework::vectorize(in_x->dims());
auto diff_dst_md =
mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), diff_fmt);
auto src_md =
mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), src_fmt);

const float epsilon = ctx.Attr<float>("epsilon");

this->AcquireForwardPrimitiveDescriptor(
mkldnn::prop_kind::forward_training, src_md, epsilon,
mkldnn::normalization_flags::use_scale_shift);
this->AcquireBackwardPrimitiveDescriptor(
mkldnn::prop_kind::backward, diff_dst_md, src_md, epsilon,
mkldnn::normalization_flags::use_scale_shift);
}

std::shared_ptr<mkldnn::memory> AcquireScaleShiftMemory(const Tensor *scale,
const Tensor *shift,
const bool is_test) {
auto scaleshift_memory = this->AcquireMemory("@scaleshift_mem_p");
if (scaleshift_memory == nullptr || !is_test) {
auto scale_tz = paddle::framework::vectorize(scale->dims());
const unsigned int C = scale_tz[0];
PADDLE_ENFORCE_EQ(
scale_tz.size(), 1,
platform::errors::InvalidArgument(
"Dims of scale tensor must be 1, but received scale's size is %d",
scale_tz.size()));

auto mem_p = this->AcquireMemoryFromPrimitive(
this->fwd_pd_->weights_desc(), "@scaleshift_mem_p");

// MKLDNN requires a single piece of memory for scale and shift/bias data
auto mem_data_handle = reinterpret_cast<T *>(mem_p->get_data_handle());
std::copy(scale->data<T>(), scale->data<T>() + C, mem_data_handle);
std::copy(shift->data<T>(), shift->data<T>() + C, mem_data_handle + C);

return mem_p;
}
const Tensor *shift) {
auto scale_tz = paddle::framework::vectorize(scale->dims());
const unsigned int C = scale_tz[0];
PADDLE_ENFORCE_EQ(
scale_tz.size(), 1,
platform::errors::InvalidArgument(
"Dims of scale tensor must be 1, but received scale's size is %d",
scale_tz.size()));

auto scaleshift_memory =
this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc());

// MKLDNN requires a single piece of memory for scale and shift/bias data
auto mem_data_handle =
reinterpret_cast<T *>(scaleshift_memory->get_data_handle());
std::copy(scale->data<T>(), scale->data<T>() + C, mem_data_handle);
std::copy(shift->data<T>(), shift->data<T>() + C, mem_data_handle + C);
return scaleshift_memory;
}

std::shared_ptr<mkldnn::memory> AcquireDiffScaleShiftMemory(
T *diff_scaleshift_data) {
return this->AcquireMemoryFromPrimitive(this->bwd_pd_->diff_weights_desc(),
diff_scaleshift_data,
"@diff_scaleshift_mem_p");
diff_scaleshift_data);
}

std::shared_ptr<mkldnn::memory> AcquireMeanMemory(
const framework::Tensor *mean) {
const T *mean_data = mean->data<T>();
return this->AcquireMemoryFromPrimitive(
this->fwd_pd_->mean_desc(), to_void_cast<T>(mean_data), "@mean_mem_p");
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(),
to_void_cast<T>(mean_data));
}

std::shared_ptr<mkldnn::memory> AcquireMeanMemory(framework::Tensor *mean) {
T *mean_data = mean->mutable_data<T>(this->place_,
this->fwd_pd_->mean_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(),
mean_data, "@mean_mem_p");
mean_data);
}

std::shared_ptr<mkldnn::memory> AcquireVarianceMemory(
const framework::Tensor *variance) {
const T *variance_data = variance->data<T>();
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(),
to_void_cast<T>(variance_data),
"@variance_mem_p");
to_void_cast<T>(variance_data));
}

std::shared_ptr<mkldnn::memory> AcquireVarianceMemory(
framework::Tensor *variance) {
T *variance_data = variance->mutable_data<T>(
this->place_, this->fwd_pd_->variance_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(),
variance_data, "@variance_mem_p");
variance_data);
}
};

Expand All @@ -220,13 +202,11 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto *batch_mean = ctx.Output<Tensor>("SavedMean");
auto *batch_variance = ctx.Output<Tensor>("SavedVariance");

BatchNormMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine,
ctx.GetPlace(), x, global_stats,
test_mode, ctx.OutputName("SavedMean"));
BatchNormMKLDNNHandler<T> handler(ctx, mkldnn_engine, x, global_stats,
test_mode);

auto src_memory = handler.AcquireSrcMemory(x);
auto scaleshift_memory =
handler.AcquireScaleShiftMemory(scale, shift, is_test);
auto scaleshift_memory = handler.AcquireScaleShiftMemory(scale, shift);
auto dst_memory = handler.AcquireDstMemory(y);

auto batch_norm_p = handler.AcquireForwardPrimitive();
Expand Down Expand Up @@ -303,8 +283,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));

BatchNormMKLDNNHandler<T> handler(ctx, dev_ctx, ctx.GetPlace(), x, scale,
diff_y, ctx.InputName("SavedMean"));
BatchNormMKLDNNHandler<T> handler(ctx, mkldnn_engine, x, scale, diff_y);

// MKLDNN requires a single piece of memory for scale and shift/bias data
const unsigned int C = paddle::framework::vectorize(scale->dims())[0];
Expand All @@ -316,8 +295,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto mean_memory = handler.AcquireMeanMemory(batch_mean);
auto variance_memory = handler.AcquireVarianceMemory(batch_variance);
auto diff_dst_memory = handler.AcquireDiffDstMemory(diff_y);
auto scaleshift_memory =
handler.AcquireScaleShiftMemory(scale, shift, false);
auto scaleshift_memory = handler.AcquireScaleShiftMemory(scale, shift);
auto diff_src_memory = handler.AcquireDiffSrcMemory(diff_x);
auto diff_scaleshift_memory =
handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data());
Expand Down
38 changes: 15 additions & 23 deletions paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,27 +30,21 @@ using platform::to_void_cast;

template <typename T = float>
class InterpolateMKLDNNHandler
: public platform::MKLDNNHandlerT<T, dnnl::resampling_forward> {
: public platform::MKLDNNHandlerNoCachingT<T, dnnl::resampling_forward> {
public:
InterpolateMKLDNNHandler(const dnnl::algorithm algo,
const platform::MKLDNNDeviceContext& dev_ctx,
const dnnl::engine engine, platform::Place cpu_place,
const Tensor* x, Tensor* z,
const std::string& uniq_name)
: platform::MKLDNNHandlerT<T, dnnl::resampling_forward>(
dev_ctx, engine, cpu_place,
platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
uniq_name)) {
if (!this->isCached()) {
const auto src_x_tz = framework::vectorize(x->dims());
const auto dst_tz = framework::vectorize(z->dims());
const auto src_md = dnnl::memory::desc(
src_x_tz, platform::MKLDNNGetDataType<T>(), x->format());
const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any);
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, algo, src_md, dst_md);
}
const Tensor* x, Tensor* z)
: platform::MKLDNNHandlerNoCachingT<T, dnnl::resampling_forward>(
engine, cpu_place) {
const auto src_x_tz = framework::vectorize(x->dims());
const auto dst_tz = framework::vectorize(z->dims());
const auto src_md = dnnl::memory::desc(
src_x_tz, platform::MKLDNNGetDataType<T>(), x->format());
const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any);
this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_inference,
algo, src_md, dst_md);
}
};

Expand Down Expand Up @@ -145,7 +139,6 @@ class InterpolateMKLDNNKernel : public framework::OpKernel<T> {
const auto& mkldnn_engine = dev_ctx.GetEngine();

const auto* x = ctx.Input<Tensor>("X");
std::vector<float> scale_prior;
auto* z = ctx.Output<Tensor>("Out");

auto interp_method = ctx.Attr<std::string>("interp_method");
Expand All @@ -155,11 +148,10 @@ class InterpolateMKLDNNKernel : public framework::OpKernel<T> {

auto out_dims_vec = ComputeOutputShape(ctx);
framework::DDim dim_out = framework::make_ddim(out_dims_vec);
z->mutable_data<T>(dim_out, ctx.GetPlace());
z->Resize(dim_out);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you earlier need this "mutable_data" call and why don't you need it now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great question. The `mutable_data` call is not needed here because it is already called internally inside `AcquireDstMemory`. Keeping it here (the line that is now deleted) meant it was called twice, which was unnecessary and in some situations could cause a performance drop.


InterpolateMKLDNNHandler<T> handler(algo, dev_ctx, mkldnn_engine,
ctx.GetPlace(), x, z,
ctx.OutputName("Out"));
InterpolateMKLDNNHandler<T> handler(algo, mkldnn_engine, ctx.GetPlace(), x,
z);

auto src_memory_p = handler.AcquireSrcMemory(x);
auto dst_memory_p = handler.AcquireDstMemory(z);
Expand Down