From 8aa132b5f849bff190241235f5ea17f828d7ca44 Mon Sep 17 00:00:00 2001
From: ronny1996 <524019753@qq.com>
Date: Fri, 9 Jul 2021 07:33:55 +0000
Subject: [PATCH 1/4] add pool2d_op_npu and test

---
 paddle/fluid/operators/pool_op_npu.cc         | 289 +++++++
 .../tests/unittests/npu/test_pool2d_op_npu.py | 710 ++++++++++++++++++
 2 files changed, 999 insertions(+)
 create mode 100644 paddle/fluid/operators/pool_op_npu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc
new file mode 100644
index 00000000000000..a5031401ca949c
--- /dev/null
+++ b/paddle/fluid/operators/pool_op_npu.cc
@@ -0,0 +1,289 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/pool_op.h"
+#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/fill_constant_op.h"
+
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+// NPU forward kernel for pool2d: dispatches to AvgPoolV2 / MaxPoolV3.
+template <typename T>
+class NPUPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();
+    const Tensor *in_x = ctx.Input<Tensor>("X");
+    Tensor *out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::string data_format = ctx.Attr<std::string>("data_format");
+
+    bool global_pooling = ctx.Attr<bool>("global_pooling");
+    bool ceil_mode = ctx.Attr<bool>("ceil_mode");
+    bool exclusive = ctx.Attr<bool>("exclusive");
+    bool adaptive = ctx.Attr<bool>("adaptive");
+    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
+
+    const bool channel_last = data_format == "NHWC";
+    // data_dims / out_data_dims hold only the spatial (H, W) extents.
+    auto in_x_dims = in_x->dims();
+    auto out_dims = out->dims();
+    framework::DDim data_dims;
+    framework::DDim out_data_dims;
+    Tensor in_x_tensor, out_tensor;
+    in_x_tensor.ShareDataWith(*in_x);
+    out_tensor.ShareDataWith(*out);
+    std::vector<int> ksize_vec(4, 1);
+    std::vector<int> strides_vec(4, 1);
+    if (channel_last) {
+      data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
+      out_data_dims = framework::slice_ddim(out_dims, 1, out_dims.size() - 1);
+      ksize_vec[1] = ksize[0];
+      ksize_vec[2] = ksize[1];
+      strides_vec[1] = strides[0];
+      strides_vec[2] = strides[1];
+      in_x_tensor.set_layout(DataLayout::kNHWC);
+      out_tensor.set_layout(DataLayout::kNHWC);
+    } else {
+      data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
+      out_data_dims = framework::slice_ddim(out_dims, 2, out_dims.size());
+      ksize_vec[2] = ksize[0];
+      ksize_vec[3] = ksize[1];
+      strides_vec[2] = strides[0];
+      strides_vec[3] = strides[1];
+    }
+    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
+                  data_dims, strides, ksize);
+    PADDLE_ENFORCE_LT(
+        std::max(paddings[0], paddings[1]), ksize[0],
+        platform::errors::InvalidArgument(
+            "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.",
+            ksize[0], std::max(paddings[0], paddings[1])));
+    PADDLE_ENFORCE_LT(
+        std::max(paddings[2], paddings[3]), ksize[1],
+        platform::errors::InvalidArgument(
+            "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.",
+            ksize[1], std::max(paddings[2], paddings[3])));
+    if (adaptive) {
+      PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0], 0,
+                        platform::errors::InvalidArgument(
+                            "When adaptive = True, the H and W of input must "
+                            "be divisible by the output, "
+                            "but x dims is %s, out dims is %s",
+                            data_dims, out_data_dims));
+      PADDLE_ENFORCE_EQ(data_dims[1] % out_data_dims[1], 0,
+                        platform::errors::InvalidArgument(
+                            "When adaptive = True, the H and W of input must "
+                            "be divisible by the output, "
+                            "but x dims is %s, out dims is %s",
+                            data_dims, out_data_dims));
+      if (channel_last) {
+        strides_vec[1] = data_dims[0] / out_data_dims[0];
+        strides_vec[2] = data_dims[1] / out_data_dims[1];
+        ksize_vec[1] = strides_vec[1];
+        ksize_vec[2] = strides_vec[2];
+      } else {
+        strides_vec[2] = data_dims[0] / out_data_dims[0];
+        strides_vec[3] = data_dims[1] / out_data_dims[1];
+        ksize_vec[2] = strides_vec[2];
+        ksize_vec[3] = strides_vec[3];
+      }
+    }
+
+    std::string pooling_mode = "AvgPoolV2";
+    if (pooling_type == "max") {
+      PADDLE_ENFORCE_EQ(
+          exclusive, true,
+          platform::errors::InvalidArgument(
+              "MaxPool only support exclusive=true, but got false"));
+      pooling_mode = "MaxPoolV3";
+    }
+    const auto &runner =
+        NpuOpRunner(pooling_mode, {in_x_tensor}, {out_tensor},
+                    {{"ksize", ksize_vec},
+                     {"strides", strides_vec},
+                     {"padding_mode", std::string("CALCULATED")},
+                     {"pads", paddings},
+                     {"data_format", data_format},
+                     {"global_pooling", global_pooling},
+                     {"ceil_mode", ceil_mode},
+                     {"exclusive", exclusive}});
+    runner.Run(dev_ctx.stream());
+  }
+};
+
+// NPU backward kernel for pool2d: MaxPoolV3Grad on device, avg grad on CPU.
+template <typename T>
+class NPUPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();
+    const Tensor *in_x = ctx.Input<Tensor>("X");
+    const Tensor *out = ctx.Input<Tensor>("Out");
+    const Tensor *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor *in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    in_x_grad->mutable_data<T>(ctx.GetPlace());
+
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    bool ceil_mode = ctx.Attr<bool>("ceil_mode");
+    bool exclusive = ctx.Attr<bool>("exclusive");
+    bool adaptive = ctx.Attr<bool>("adaptive");
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    bool global_pooling = ctx.Attr<bool>("global_pooling");
+    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
+
+    const bool channel_last = data_format == "NHWC";
+    // Spatial (H, W) dims plus 4-D ksize/strides laid out per data_format.
+    auto in_x_dims = in_x->dims();
+    auto out_dims = out->dims();
+    framework::DDim data_dims;
+    framework::DDim out_data_dims;
+    std::vector<int> ksize_vec(4, 1);
+    std::vector<int> strides_vec(4, 1);
+    Tensor in_x_tensor, out_tensor, out_grad_tensor, in_x_grad_tensor;
+    in_x_tensor.ShareDataWith(*in_x);
+    out_tensor.ShareDataWith(*out);
+    out_grad_tensor.ShareDataWith(*out_grad);
+    in_x_grad_tensor.ShareDataWith(*in_x_grad);
+    if (channel_last) {
+      data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
+      out_data_dims = framework::slice_ddim(out_dims, 1, out_dims.size() - 1);
+      ksize_vec[1] = ksize[0];
+      ksize_vec[2] = ksize[1];
+      strides_vec[1] = strides[0];
+      strides_vec[2] = strides[1];
+      in_x_tensor.set_layout(DataLayout::kNHWC);
+      out_tensor.set_layout(DataLayout::kNHWC);
+      out_grad_tensor.set_layout(DataLayout::kNHWC);
+      in_x_grad_tensor.set_layout(DataLayout::kNHWC);
+    } else {
+      data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size());
+      out_data_dims = framework::slice_ddim(out_dims, 2, out_dims.size());
+      ksize_vec[2] = ksize[0];
+      ksize_vec[3] = ksize[1];
+      strides_vec[2] = strides[0];
+      strides_vec[3] = strides[1];
+    }
+    UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
+                  data_dims, strides, ksize);
+    if (global_pooling) {
+      adaptive = true;
+    }
+    PADDLE_ENFORCE_LT(
+        std::max(paddings[0], paddings[1]), ksize[0],
+        platform::errors::InvalidArgument(
+            "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.",
+            ksize[0], std::max(paddings[0], paddings[1])));
+    PADDLE_ENFORCE_LT(
+        std::max(paddings[2], paddings[3]), ksize[1],
+        platform::errors::InvalidArgument(
+            "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.",
+            ksize[1], std::max(paddings[2], paddings[3])));
+    if (adaptive) {
+      PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0], 0,
+                        platform::errors::InvalidArgument(
+                            "When adaptive = True, H and W must be divisible, "
+                            "but input dims is %s, output dims is %s",
+                            data_dims, out_data_dims));
+      PADDLE_ENFORCE_EQ(data_dims[1] % out_data_dims[1], 0,
+                        platform::errors::InvalidArgument(
+                            "When adaptive = True, H and W must be divisible, "
+                            "but input dims is %s, output dims is %s",
+                            data_dims, out_data_dims));
+      if (channel_last) {
+        strides_vec[1] = data_dims[0] / out_data_dims[0];
+        strides_vec[2] = data_dims[1] / out_data_dims[1];
+        ksize_vec[1] = strides_vec[1];
+        ksize_vec[2] = strides_vec[2];
+      } else {
+        strides_vec[2] = data_dims[0] / out_data_dims[0];
+        strides_vec[3] = data_dims[1] / out_data_dims[1];
+        ksize_vec[2] = strides_vec[2];
+        ksize_vec[3] = strides_vec[3];
+      }
+    }
+
+    if (pooling_type == "max") {
+      if (global_pooling) {
+        for (auto &s : strides_vec) {
+          s = 1;
+        }
+        PADDLE_ENFORCE_LT(std::max(data_dims[0], data_dims[1]), 255,
+                          platform::errors::InvalidArgument(
+                              "MaxPoolV3Grad H, W must be less than 255 when "
+                              "global_pooling = True, but got %s",
+                              data_dims));
+        global_pooling = false;
+      }
+      const auto &runner = NpuOpRunner(
+          "MaxPoolV3Grad", {in_x_tensor, out_tensor, out_grad_tensor},
+          {in_x_grad_tensor}, {{"ksize", ksize_vec},
+                               {"strides", strides_vec},
+                               {"padding_mode", std::string("CALCULATED")},
+                               {"pads", paddings},
+                               {"data_format", data_format},
+                               {"global_pooling", global_pooling},
+                               {"ceil_mode", ceil_mode},  // 0: floor, 1: ceil
+                               {"exclusive", exclusive}});
+      runner.Run(dev_ctx.stream());
+    } else if (pooling_type == "avg") {
+      auto cpu_dev_ctx = platform::CPUDeviceContext(platform::CPUPlace());
+      Tensor cpu_in_x, cpu_out, cpu_in_x_grad, cpu_out_grad;
+      cpu_in_x.mutable_data<T>(in_x->dims(), cpu_dev_ctx.GetPlace());
+      cpu_in_x_grad.mutable_data<T>(in_x_grad->dims(), cpu_dev_ctx.GetPlace());
+      cpu_out.mutable_data<T>(out->dims(), cpu_dev_ctx.GetPlace());
+      cpu_out_grad.mutable_data<T>(out_grad->dims(), cpu_dev_ctx.GetPlace());
+      framework::TensorCopy(*in_x, cpu_dev_ctx.GetPlace(), dev_ctx, &cpu_in_x);
+      framework::TensorCopy(*out, cpu_dev_ctx.GetPlace(), dev_ctx, &cpu_out);
+      framework::TensorCopy(*out_grad, cpu_dev_ctx.GetPlace(), dev_ctx,
+                            &cpu_out_grad);
+      math::SetConstant<platform::CPUDeviceContext, T> set_constant;
+      set_constant(cpu_dev_ctx, &cpu_in_x_grad, static_cast<T>(0));
+      dev_ctx.Wait();
+
+      paddle::operators::math::Pool2dGradFunctor<
+          platform::CPUDeviceContext, paddle::operators::math::AvgPoolGrad<T>,
+          T>
+          pool2d_backward;
+      paddle::operators::math::AvgPoolGrad<T> pool_process;
+      pool2d_backward(cpu_dev_ctx, cpu_in_x, cpu_out, cpu_out_grad, ksize,
+                      strides, paddings, data_format, exclusive, adaptive,
+                      &cpu_in_x_grad, pool_process);
+      framework::TensorCopy(cpu_in_x_grad, dev_ctx.GetPlace(), dev_ctx,
+                            in_x_grad);
+      // Wait so the copy completes before the local cpu_in_x_grad is freed.
+      dev_ctx.Wait();
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;  // NOTE(review): alias is never used here
+REGISTER_OP_NPU_KERNEL(pool2d, ops::NPUPoolOpKernel<float>);
+REGISTER_OP_NPU_KERNEL(pool2d_grad, ops::NPUPoolGradOpKernel<float>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
new file mode 100644
index 00000000000000..b7383f9a346ca5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
@@ -0,0 +1,710 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+import unittest
+import numpy as np
+sys.path.append("..")
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from op_test import OpTest
+from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive
+from paddle.nn.functional import avg_pool2d, max_pool2d
+
+paddle.enable_static()
+
+
+def create_test_padding_SAME_class(parent):  # SAME-padding variant of parent
+    @unittest.skipIf(not paddle.is_compiled_with_npu(),
+                     "core is not compiled with NPU")
+    class TestPaddingSAMECase(parent):
+        def init_paddings(self):
+            self.paddings = [0, 0]
+            self.padding_algorithm = "SAME"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp")
+    TestPaddingSAMECase.__name__ = cls_name
+    globals()[cls_name] = TestPaddingSAMECase  # publish at module level
+
+
+def create_test_use_ceil_class(parent):  # ceil_mode=True variant of parent
+    @unittest.skipIf(not paddle.is_compiled_with_npu(),
+                     "core is not compiled with NPU")
+    class TestPool2DUseCeilCase(parent):
+        def init_ceil_mode(self):
+            self.ceil_mode = True
+
+    cls_name = "{0}_{1}".format(parent.__name__, "CeilModeCast")
+    TestPool2DUseCeilCase.__name__ = cls_name
+    globals()[cls_name] = TestPool2DUseCeilCase  # publish at module level
+
+
+def create_test_padding_VALID_class(parent):  # VALID-padding variant
+    @unittest.skipIf(not paddle.is_compiled_with_npu(),
+                     "core is not compiled with NPU")
+    class TestPaddingVALIDCase(parent):
+        def init_paddings(self):
+            self.paddings = [1, 1]
+            self.padding_algorithm = "VALID"
+
+    cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp")
+    TestPaddingVALIDCase.__name__ = cls_name
+    globals()[cls_name] = TestPaddingVALIDCase  # publish at module level
+
+
+def pool2d_backward_navie(x,
+                          ksize,
+                          strides,
+                          padding_algorithm,
+                          paddings,
+                          pool_type,
+                          is_global,
+                          is_adaptive,
+                          is_exclusive,
+                          data_format="NCHW"):
+    """Reference gradient of pool2d w.r.t. x for an all-ones output gradient.
+
+    Every pooling window adds to the input positions it covers: 1 at the
+    arg-max for "max", 1 / window_area for "avg". With is_exclusive the
+    window is clipped to the un-padded region first. is_adaptive is accepted
+    but not used. Returns an array with the shape and layout of x.
+    """
+    # update paddings
+    def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
+        padding = []
+        for input_size, filter_size, stride_size in zip(input_shape, pool_size,
+                                                        pool_stride):
+            out_size = int((input_size + stride_size - 1) / stride_size)
+            pad_sum = np.max((
+                (out_size - 1) * stride_size + filter_size - input_size, 0))
+            pad_0 = int(pad_sum / 2)
+            pad_1 = int(pad_sum - pad_0)
+            padding.append(pad_0)
+            padding.append(pad_1)
+        return padding
+
+    if isinstance(padding_algorithm, str):
+        padding_algorithm = padding_algorithm.upper()
+        if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]:
+            raise ValueError("Unknown Attr(padding_algorithm): '%s'. "
+                             "It can only be 'SAME', 'VALID' or 'EXPLICIT'." %
+                             str(padding_algorithm))
+
+        if padding_algorithm == "VALID":
+            paddings = [0, 0, 0, 0]
+        elif padding_algorithm == "SAME":
+            input_data_shape = []
+            if data_format == "NCHW":
+                input_data_shape = x.shape[2:4]
+            elif data_format == "NHWC":
+                input_data_shape = x.shape[1:3]
+            paddings = _get_padding_with_SAME(input_data_shape, ksize, strides)
+
+    # All the arithmetic below is done in NCHW; NHWC is transposed in and out.
+    if data_format == "NHWC":
+        x = x.transpose([0, 3, 1, 2])
+
+    if is_global:
+        ksize = x.shape[2:]
+        paddings = [0, 0, 0, 0]
+
+    # Normalise paddings to [top, bottom, left, right].
+    if len(paddings) == 2:
+        paddings = [paddings[0], paddings[0], paddings[1], paddings[1]]
+    pad_top, pad_left = paddings[0], paddings[2]
+
+    # Embed x in a zero-padded buffer so windows can be indexed uniformly.
+    x_old_shape = x.shape
+    N, C, H, W = x_old_shape
+    x_shape = np.array(x.shape) + np.array(
+        [0, 0, paddings[0] + paddings[1], paddings[2] + paddings[3]])
+    x_new = np.zeros(x_shape)
+    x_new[:, :, pad_top:pad_top + H, pad_left:pad_left + W] = x
+    x = x_new
+
+    N, C, H, W = x.shape
+    x_grad = np.zeros(x.shape, dtype=np.float32)
+    for n in range(N):
+        for c in range(C):
+            for h in range(0, H - ksize[0] + 1, strides[0]):
+                for w in range(0, W - ksize[1] + 1, strides[1]):
+                    start_h = h
+                    start_w = w
+                    end_h = h + ksize[0]
+                    end_w = w + ksize[1]
+                    if is_exclusive:
+                        start_h = max(start_h, paddings[0])
+                        start_w = max(start_w, paddings[2])
+                        end_h = min(end_h, H - paddings[1])
+                        end_w = min(end_w, W - paddings[3])
+                    # "ij" indexing keeps axis 0 = h and axis 1 = w, which the
+                    # row-major decoding of the flat arg-max below relies on.
+                    idx = np.meshgrid(
+                        range(start_h, end_h),
+                        range(start_w, end_w),
+                        indexing="ij")
+                    if pool_type == "max":
+                        idx = np.argmax(x[n, c, idx[0], idx[1]].flatten())
+                        idx_h = idx // (end_w - start_w)
+                        idx_w = idx % (end_w - start_w)
+                        x_grad[n, c, start_h + idx_h, start_w + idx_w] += 1
+                    elif pool_type == "avg":
+                        x_grad[n, c, idx[0], idx[1]] += 1 / (
+                            end_h - start_h) / (end_w - start_w)
+
+    # Crop the padding border off again.
+    N, C, H, W = x_old_shape
+    x_grad_new = np.zeros(x_old_shape)
+    x_grad_new[:] = x_grad[:, :, pad_top:pad_top + H, pad_left:pad_left + W]
+    x_grad = x_grad_new
+    if data_format == "NHWC":
+        x_grad = x_grad.transpose([0, 2, 3, 1])
+    return x_grad
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestPool2D_Op(OpTest):  # base case; subclasses override init_* hooks
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "pool2d"
+        self.init_kernel_type()
+        self.init_data_type()
+        self.init_test_case()
+        self.padding_algorithm = "EXPLICIT"
+        self.init_paddings()  # after init_test_case(): wins on self.paddings
+        self.init_global_pool()
+        self.init_kernel_type()  # NOTE(review): repeats the call above
+        self.init_pool_type()
+        self.init_ceil_mode()
+        self.init_exclusive()
+        self.init_adaptive()
+        self.init_data_format()
+        self.init_shape()
+
+        input = np.random.random(self.shape).astype(self.dtype)
+        if self.pool_type == "max":
+            input = np.array([x for x in range(np.prod(self.shape))]).reshape(
+                self.shape).astype(self.dtype)
+        output = pool2D_forward_naive(
+            input, self.ksize, self.strides, self.paddings, self.global_pool,
+            self.ceil_mode, self.exclusive, self.adaptive, self.data_format,
+            self.pool_type, self.padding_algorithm).astype(self.dtype)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
+
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
+            'use_cudnn': False,
+            'use_mkldnn': False,
+            'ceil_mode': self.ceil_mode,
+            'data_format': self.data_format,
+            'exclusive': self.exclusive,
+            'adaptive': self.adaptive,
+            "padding_algorithm": self.padding_algorithm,
+        }
+
+        self.outputs = {'Out': output}
+
+    def init_data_format(self):
+        self.data_format = "NCHW"
+
+    def init_shape(self):
+        self.shape = [2, 3, 5, 5]
+
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+
+    def init_paddings(self):
+        self.paddings = [0, 0]
+        self.padding_algorithm = "EXPLICIT"
+
+    def init_kernel_type(self):
+        self.use_cudnn = False
+
+    def init_data_type(self):
+        self.dtype = np.float32
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = True
+
+    def init_ceil_mode(self):
+        self.ceil_mode = False
+
+    def init_exclusive(self):
+        self.exclusive = True
+
+    def init_adaptive(self):
+        self.adaptive = False
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(fluid.NPUPlace(0), atol=1e-3)
+
+    def test_check_grad(self):
+        x_grad = pool2d_backward_navie(
+            self.inputs['X'], self.ksize, self.strides, self.padding_algorithm,
+            self.paddings, self.pool_type, self.global_pool, self.adaptive,
+            self.exclusive, self.data_format)
+        x_grad = x_grad / np.prod(self.outputs['Out'].shape)
+        self.check_grad_with_place(
+            fluid.NPUPlace(0),
+            set(['X']),
+            'Out',
+            max_relative_error=0.06,
+            user_defined_grads=[x_grad])
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase1(TestPool2D_Op):  # avg, non-global, no padding, 7x7 input
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+
+    def init_paddings(self):
+        self.paddings = [0, 0]
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+    def init_shape(self):
+        self.shape = [2, 3, 7, 7]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase2(TestPool2D_Op):  # avg, non-global, padding 1, 7x7 input
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+
+    def init_paddings(self):
+        self.paddings = [1, 1]
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+    def init_shape(self):
+        self.shape = [2, 3, 7, 7]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase3(TestPool2D_Op):  # max-pool variant of TestPool2D_Op
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase4(TestCase1):  # max-pool variant of TestCase1
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase5(TestCase2):  # max-pool variant of TestCase2
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAvgInclude(TestCase2):  # TestCase2 with exclusive=False
+    def init_exclusive(self):
+        self.exclusive = False
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAvgPoolAdaptive(TestCase1):  # adaptive avg pool, 7x7 -> one window
+    def init_adaptive(self):
+        self.adaptive = True
+
+    def init_shape(self):
+        self.shape = [2, 3, 7, 7]
+
+    def init_test_case(self):
+        self.ksize = [7, 7]
+        self.strides = [7, 7]
+        self.paddings = [0, 0, 0, 0]  # NOTE(review): reset by init_paddings()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAvgPoolAdaptiveAsyOutSize(TestCase1):  # adaptive, 2x4 windows
+    def init_adaptive(self):
+        self.adaptive = True
+
+    def init_shape(self):
+        self.shape = [2, 3, 8, 8]
+
+    def init_test_case(self):
+        self.ksize = [2, 4]
+        self.strides = [2, 4]
+        self.paddings = [0, 0, 0, 0]  # NOTE(review): reset by init_paddings()
+
+
+#-------test pool2d with asymmetric padding-----
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestPool2D_AsyPadding(TestPool2D_Op):
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 0, 1, 2]  # NOTE(review): reset by init_paddings()
+
+    def init_shape(self):
+        self.shape = [2, 3, 5, 5]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase1_AsyPadding(TestCase1):
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 0, 1, 0]  # NOTE(review): reset by init_paddings()
+
+    def init_shape(self):
+        self.shape = [2, 3, 7, 7]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase2_AsyPadding(TestCase2):
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 2, 1, 2]  # NOTE(review): reset by init_paddings()
+
+    def init_shape(self):
+        self.shape = [2, 3, 7, 7]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase3_AsyPadding(TestCase3):
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 0, 1, 2]  # NOTE(review): reset by init_paddings()
+
+    def init_shape(self):
+        self.shape = [2, 3, 5, 5]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase4_AsyPadding(TestCase4):
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 0, 1, 0]  # NOTE(review): reset by init_paddings()
+
+    def init_shape(self):
+        self.shape = [2, 3, 7, 7]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase5_AsyPadding((TestCase5)):
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [2, 2, 1, 2]  # NOTE(review): reset by init_paddings()
+
+    def init_shape(self):
+        self.shape = [2, 3, 7, 7]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAvgInclude_AsyPadding(TestCase2):
+    def init_exclusive(self):
+        self.exclusive = False
+
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 2, 1, 2]  # NOTE(review): reset by init_paddings()
+
+    def init_shape(self):
+        self.shape = [2, 3, 7, 7]
+
+
+# class TestAvgPoolAdaptive_AsyPadding(TestCase1):
+#     def init_adaptive(self):
+#         self.adaptive = True
+
+#     def init_test_case(self):
+#         self.ksize = [2, 2]
+#         self.strides = [2, 2]
+#         self.paddings = [0, 0, 0, 0]
+
+#     def init_shape(self):
+#         self.shape = [2, 3, 8, 8]
+
+
+#----------- test channel_last --------------
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestPool2D_channel_last(TestPool2D_Op):  # NHWC variant of the base case
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 5, 5, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase1_channel_last(TestCase1):  # NHWC variant of TestCase1
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase2_channel_last(TestCase2):  # NHWC variant of TestCase2
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase3_channel_last(TestCase3):  # NHWC variant of TestCase3
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 5, 5, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase4_channel_last(TestCase4):  # NHWC variant of TestCase4
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase5_channel_last(TestCase5):  # NHWC variant of TestCase5
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase5_Max(TestCase2):
+    def init_pool_type(self):
+        self.pool_type = "max"
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase5_channel_last_Max(TestCase5_Max):
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAvgInclude_channel_last(TestCase2_channel_last):
+    def init_exclusive(self):
+        self.exclusive = False
+
+
+# @unittest.skipIf(not paddle.is_compiled_with_npu(),
+#                  "core is not compiled with NPU")
+# class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
+#     def init_adaptive(self):
+#         self.adaptive = True
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 5, 5, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 5, 5, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NHWC"
+
+    def init_shape(self):
+        self.shape = [2, 7, 7, 3]
+
+
+# @unittest.skipIf(not paddle.is_compiled_with_npu(),
+#                  "core is not compiled with NPU")
+# class TestAvgPoolAdaptive_AsyPadding_channel_last(
+#         TestAvgPoolAdaptive_AsyPadding):
+#     def init_data_format(self):
+#         self.data_format = "NHWC"
+
+#     def init_shape(self):
+#         self.shape = [2, 7, 7, 3]
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCase1_strides(TestCase1):
+    def init_test_case(self):
+        self.ksize = [3, 3]
+        self.strides = [1, 2]
+
+    def init_shape(self):
+        self.shape = [2, 3, 4, 5]
+
+
+create_test_padding_SAME_class(TestPool2D_Op)
+create_test_padding_SAME_class(TestCase1)
+create_test_padding_SAME_class(TestCase2)
+create_test_padding_SAME_class(TestCase3)
+create_test_padding_SAME_class(TestCase4)
+create_test_padding_SAME_class(TestCase5)
+create_test_padding_SAME_class(TestPool2D_channel_last)
+create_test_padding_SAME_class(TestCase1_channel_last)
+create_test_padding_SAME_class(TestCase2_channel_last)
+create_test_padding_SAME_class(TestCase3_channel_last)
+create_test_padding_SAME_class(TestCase4_channel_last)
+create_test_padding_SAME_class(TestCase5_channel_last)
+create_test_padding_SAME_class(TestCase1_strides)
+
+create_test_padding_VALID_class(TestPool2D_Op)
+create_test_padding_VALID_class(TestCase1)
+create_test_padding_VALID_class(TestCase2)
+create_test_padding_VALID_class(TestCase3)
+create_test_padding_VALID_class(TestCase4)
+create_test_padding_VALID_class(TestCase5)
+create_test_padding_VALID_class(TestPool2D_channel_last)
+create_test_padding_VALID_class(TestCase1_channel_last)
+create_test_padding_VALID_class(TestCase2_channel_last)
+create_test_padding_VALID_class(TestCase3_channel_last)
+create_test_padding_VALID_class(TestCase4_channel_last)
+create_test_padding_VALID_class(TestCase5_channel_last)
+
+create_test_use_ceil_class(TestCase1)
+create_test_use_ceil_class(TestCase2)
+create_test_use_ceil_class(TestCase1_AsyPadding)
+create_test_use_ceil_class(TestCase2_AsyPadding)
+create_test_use_ceil_class(TestCase1_channel_last)
+create_test_use_ceil_class(TestCase2_channel_last)
+create_test_use_ceil_class(TestCase1_AsyPadding_channel_last)
+create_test_use_ceil_class(TestCase2_AsyPadding_channel_last)
+
+if __name__ == "__main__":
+    unittest.main()

From b341e4a83437f5a211b3606f5ebff3743b30c467 Mon Sep 17 00:00:00 2001
From: ronny1996 <524019753@qq.com>
Date: Tue, 10 Aug 2021 10:27:47 +0000
Subject: [PATCH 2/4] update pool2d NPU kernel: adaptive pooling, AvgPoolV2Grad, fp16

---
 paddle/fluid/operators/pool_op_npu.cc         | 175 ++++++++++--------
 .../tests/unittests/npu/test_pool2d_op_npu.py | 158 ++++++----------
 2 files changed, 153 insertions(+), 180 deletions(-)

diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc
index a5031401ca949c..1b52405c8b3659 100644
--- a/paddle/fluid/operators/pool_op_npu.cc
+++ b/paddle/fluid/operators/pool_op_npu.cc
@@ -50,14 +50,16 @@ class NPUPoolOpKernel : public framework::OpKernel<T> {
     auto out_dims = out->dims();
     framework::DDim data_dims;
     framework::DDim out_data_dims;
+
     Tensor in_x_tensor, out_tensor;
     in_x_tensor.ShareDataWith(*in_x);
     out_tensor.ShareDataWith(*out);
     std::vector<int> ksize_vec(4, 1);
     std::vector<int> strides_vec(4, 1);
+
     if (channel_last) {
       data_dims = framework::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
-      out_data_dims = framework::slice_ddim(out_dims, 2, out_dims.size());
+      out_data_dims = framework::slice_ddim(out_dims, 1, out_dims.size() - 1);
       ksize_vec[1] = ksize[0];
       ksize_vec[2] = ksize[1];
       strides_vec[1] = strides[0];
@@ -84,52 +86,69 @@ class NPUPoolOpKernel : public framework::OpKernel<T> {
         platform::errors::InvalidArgument(
             "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.",
             ksize[1], std::max(paddings[2], paddings[3])));
+
     if (adaptive) {
-      PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0], 0,
-                        platform::errors::InvalidArgument(
-                            "When adaptive = True, the H and W of input must "
-                            "be divisible by the output, "
-                            "but x dims is %s, out dims is %s",
-                            data_dims, out_data_dims));
-      PADDLE_ENFORCE_EQ(data_dims[1] % out_data_dims[1], 0,
-                        platform::errors::InvalidArgument(
-                            "When adaptive = True, the H and W of input must "
-                            "be divisible by the output,, "
-                            "but x dims is %s, out dims is %s",
-                            data_dims, out_data_dims));
-      if (channel_last) {
-        strides_vec[1] = data_dims[0] / out_data_dims[0];
-        strides_vec[2] = data_dims[1] / out_data_dims[1];
-        ksize_vec[1] = strides_vec[1];
-        ksize_vec[2] = strides_vec[2];
+      std::string pooling_mode = "AdaptiveAvgPool2d";
+      if (pooling_type == "max") {
+        pooling_mode = "AdaptiveMaxPool2d";
+      }
+
+      // AdaptiveAvgPool2d only support NCHW
+      Tensor transformed_input, transformed_output;
+      if (pooling_type == "avg" && channel_last) {
+        transformed_input.mutable_data<T>(
+            framework::make_dim(in_x_dims[0], in_x_dims[3], in_x_dims[1],
+                                in_x_dims[2]),
+            ctx.GetPlace());
+        transformed_output.mutable_data<T>(
+            framework::make_dim(out_dims[0], out_dims[3], out_dims[1],
+                                out_dims[2]),
+            ctx.GetPlace());
+
+        const auto &trans_runner =
+            NpuOpRunner("TransData", {in_x_tensor}, {transformed_input},
+                        {{"src_format", std::string("NHWC")},
+                         {"dst_format", std::string("NCHW")}});
+        trans_runner.Run(dev_ctx.stream());
       } else {
-        strides_vec[2] = data_dims[0] / out_data_dims[0];
-        strides_vec[3] = data_dims[1] / out_data_dims[1];
-        ksize_vec[2] = strides_vec[2];
-        ksize_vec[3] = strides_vec[3];
+        transformed_input.ShareDataWith(in_x_tensor);
+        transformed_output.ShareDataWith(out_tensor);
       }
-    }
 
-    std::string pooling_mode = "AvgPoolV2";
-    if (pooling_type == "max") {
-      PADDLE_ENFORCE_EQ(
-          exclusive, true,
-          platform::errors::InvalidArgument(
-              "MaxPool only support exclusive=false, but got true"));
-      pooling_mode = "MaxPoolV3";
+      const auto &runner = NpuOpRunner(
+          pooling_mode, {transformed_input}, {transformed_output},
+          {{"output_size", framework::vectorize<int>(out_data_dims)}});
+      runner.Run(dev_ctx.stream());
+
+      if (pooling_type == "avg" && channel_last) {
+        const auto &trans_runner =
+            NpuOpRunner("TransData", {transformed_output}, {out_tensor},
+                        {{"src_format", std::string("NCHW")},
+                         {"dst_format", std::string("NHWC")}});
+        trans_runner.Run(dev_ctx.stream());
+      }
+    } else {
+      std::string pooling_mode = "AvgPoolV2";
+      if (pooling_type == "max") {
+        PADDLE_ENFORCE_EQ(
+            exclusive, true,
+            platform::errors::InvalidArgument(
+                "MaxPool only support exclusive=false, but got true"));
+        pooling_mode = "MaxPoolV3";
+      }
+
+      const auto &runner =
+          NpuOpRunner(pooling_mode, {in_x_tensor}, {out_tensor},
+                      {{"ksize", ksize_vec},
+                       {"strides", strides_vec},
+                       {"padding_mode", std::string("CALCULATED")},
+                       {"pads", paddings},
+                       {"data_format", data_format},
+                       {"global_pooling", global_pooling},
+                       {"ceil_mode", ceil_mode},
+                       {"exclusive", exclusive}});
+      runner.Run(dev_ctx.stream());
     }
-    const auto &runner =
-        NpuOpRunner(pooling_mode, {in_x_tensor}, {out_tensor},
-                    {{"ksize", ksize_vec},
-                     {"strides", strides_vec},
-                     {"padding_mode", std::string("CALCULATED")},
-                     {"pads", paddings},
-                     {"data_format", data_format},
-                     {"global_pooling", global_pooling},
-                     {"ceil_mode", ceil_mode},
-                     {"exclusive", exclusive}});
-    auto stream = dev_ctx.stream();
-    runner.Run(stream);
   }
 };
 
@@ -164,6 +183,7 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
     framework::DDim out_data_dims;
     std::vector<int> ksize_vec(4, 1);
     std::vector<int> strides_vec(4, 1);
+
     Tensor in_x_tensor, out_tensor, out_grad_tensor, in_x_grad_tensor;
     in_x_tensor.ShareDataWith(*in_x);
     out_tensor.ShareDataWith(*out);
@@ -190,9 +210,7 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
     }
     UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm,
                   data_dims, strides, ksize);
-    if (global_pooling) {
-      adaptive = true;
-    }
+
     PADDLE_ENFORCE_LT(
         std::max(paddings[0], paddings[1]), ksize[0],
         platform::errors::InvalidArgument(
@@ -204,7 +222,7 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
             "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.",
             ksize[1], std::max(paddings[2], paddings[3])));
 
-    if (adaptive) {
+    if (adaptive || (global_pooling && pooling_type == "max")) {
       PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0], 0,
                         platform::errors::InvalidArgument(
                             "When adaptive = True, H and W must be divisible, "
@@ -228,6 +246,15 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
       }
     }
 
+    NPUAttributeMap attrs = {{"ksize", ksize_vec},
+                             {"strides", strides_vec},
+                             {"padding_mode", std::string("CALCULATED")},
+                             {"pads", paddings},
+                             {"data_format", data_format},
+                             {"global_pooling", global_pooling},
+                             {"ceil_mode", ceil_mode},
+                             {"exclusive", exclusive}};
+
     if (pooling_type == "max") {
       if (global_pooling) {
         for (auto &s : strides_vec) {
@@ -235,48 +262,30 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
         }
         PADDLE_ENFORCE_LT(std::max(data_dims[0], data_dims[1]), 255,
                           platform::errors::InvalidArgument(
-                              "MaxPoolV3Grad H, W must be less than 255 when "
+                              "MaxPoolGrad H, W must be less than 255 when "
                               "global_pooling = True, but got %s",
                               data_dims));
-        global_pooling = false;
+        attrs["global_pooling"] = false;
       }
+
       const auto &runner = NpuOpRunner(
           "MaxPoolV3Grad", {in_x_tensor, out_tensor, out_grad_tensor},
-          {in_x_grad_tensor}, {{"ksize", ksize_vec},
-                               {"strides", strides_vec},
-                               {"padding_mode", std::string("CALCULATED")},
-                               {"pads", paddings},
-                               {"data_format", data_format},
-                               {"global_pooling", global_pooling},
-                               {"ceil_mode", ceil_mode},
-                               {"exclusive", exclusive}});  // 0: floor, 1: ceil
+          {in_x_grad_tensor}, attrs);  // 0: floor, 1: ceil
       runner.Run(dev_ctx.stream());
     } else if (pooling_type == "avg") {
-      auto cpu_dev_ctx = platform::CPUDeviceContext(platform::CPUPlace());
-      Tensor cpu_in_x, cpu_out, cpu_in_x_grad, cpu_out_grad;
-      cpu_in_x.mutable_data<T>(in_x->dims(), cpu_dev_ctx.GetPlace());
-      cpu_in_x_grad.mutable_data<T>(in_x_grad->dims(), cpu_dev_ctx.GetPlace());
-      cpu_out.mutable_data<T>(out->dims(), cpu_dev_ctx.GetPlace());
-      cpu_out_grad.mutable_data<T>(out_grad->dims(), cpu_dev_ctx.GetPlace());
+      PADDLE_ENFORCE(strides[0] == strides[1],
+                     platform::errors::InvalidArgument(
+                         "AvgPoolGrad dose not support Asymmetric strides. but "
+                         "strides = (%d, %d)",
+                         strides[0], strides[1]));
 
-      framework::TensorCopy(*in_x, cpu_dev_ctx.GetPlace(), dev_ctx, &cpu_in_x);
-      framework::TensorCopy(*out, cpu_dev_ctx.GetPlace(), dev_ctx, &cpu_out);
-      framework::TensorCopy(*out_grad, cpu_dev_ctx.GetPlace(), dev_ctx,
-                            &cpu_out_grad);
-      math::SetConstant<platform::CPUDeviceContext, T> set_constant;
-      set_constant(cpu_dev_ctx, &cpu_in_x_grad, static_cast<T>(0));
-      dev_ctx.Wait();
-
-      paddle::operators::math::Pool2dGradFunctor<
-          platform::CPUDeviceContext, paddle::operators::math::AvgPoolGrad<T>,
-          T>
-          pool2d_backward;
-      paddle::operators::math::AvgPoolGrad<T> pool_process;
-      pool2d_backward(cpu_dev_ctx, cpu_in_x, cpu_out, cpu_out_grad, ksize,
-                      strides, paddings, data_format, exclusive, adaptive,
-                      &cpu_in_x_grad, pool_process);
-      framework::TensorCopy(cpu_in_x_grad, dev_ctx.GetPlace(), dev_ctx,
-                            in_x_grad);
+      NpuOpRunner runner;
+      runner.SetType("AvgPoolV2Grad");
+      runner.AddInput(framework::vectorize<int>(in_x->dims()));
+      runner.AddInput(out_grad_tensor);
+      runner.AddOutput(in_x_grad_tensor);
+      runner.AddAttrs(attrs);
+      runner.Run(dev_ctx.stream());
     }
   }
 };
@@ -285,5 +294,7 @@ class NPUPoolGradOpKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(pool2d, ops::NPUPoolOpKernel<float>);
-REGISTER_OP_NPU_KERNEL(pool2d_grad, ops::NPUPoolGradOpKernel<float>);
+REGISTER_OP_NPU_KERNEL(pool2d, ops::NPUPoolOpKernel<float>,
+                       ops::NPUPoolOpKernel<plat::float16>);
+REGISTER_OP_NPU_KERNEL(pool2d_grad, ops::NPUPoolGradOpKernel<float>,
+                       ops::NPUPoolGradOpKernel<plat::float16>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
index b7383f9a346ca5..83ad6b4101f577 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
@@ -30,8 +30,6 @@
 
 
 def create_test_padding_SAME_class(parent):
-    @unittest.skipIf(not paddle.is_compiled_with_npu(),
-                     "core is not compiled with NPU")
     class TestPaddingSMAECase(parent):
         def init_paddings(self):
             self.paddings = [0, 0]
@@ -43,8 +41,6 @@ def init_paddings(self):
 
 
 def create_test_use_ceil_class(parent):
-    @unittest.skipIf(not paddle.is_compiled_with_npu(),
-                     "core is not compiled with NPU")
     class TestPool2DUseCeilCase(parent):
         def init_ceil_mode(self):
             self.ceil_mode = True
@@ -55,8 +51,6 @@ def init_ceil_mode(self):
 
 
 def create_test_padding_VALID_class(parent):
-    @unittest.skipIf(not paddle.is_compiled_with_npu(),
-                     "core is not compiled with NPU")
     class TestPaddingVALIDCase(parent):
         def init_paddings(self):
             self.paddings = [1, 1]
@@ -67,6 +61,20 @@ def init_paddings(self):
     globals()[cls_name] = TestPaddingVALIDCase
 
 
+def create_test_fp16_class(parent):
+    class TestFp16Case(parent):
+        def init_kernel_type(self):
+            self.use_cudnn = False
+            self.dtype = np.float16
+
+        def test_check_grad(self):
+            return
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op")
+    TestFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestFp16Case
+
+
 def pool2d_backward_navie(x,
                           ksize,
                           strides,
@@ -162,6 +170,8 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                             range(start_h, end_h), range(start_w, end_w))
                         x_grad[n, c, idx[0], idx[1]] += 1 / (
                             end_h - start_h) / (end_w - start_w)
+                        if is_adaptive:
+                            x_grad[n, c, idx[0], idx[1]] /= np.prod(strides)
 
     x_grad_new = np.zeros(x_old_shape)
     N, C, H, W = x_old_shape
@@ -177,8 +187,6 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
     return x_grad
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestPool2D_Op(OpTest):
     def setUp(self):
         self.set_npu()
@@ -280,8 +288,6 @@ def test_check_grad(self):
             user_defined_grads=[x_grad])
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase1(TestPool2D_Op):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -301,8 +307,6 @@ def init_shape(self):
         self.shape = [2, 3, 7, 7]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase2(TestPool2D_Op):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -322,39 +326,29 @@ def init_shape(self):
         self.shape = [2, 3, 7, 7]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase3(TestPool2D_Op):
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase4(TestCase1):
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase5(TestCase2):
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestAvgInclude(TestCase2):
     def init_exclusive(self):
         self.exclusive = False
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestAvgPoolAdaptive(TestCase1):
     def init_adaptive(self):
         self.adaptive = True
@@ -368,8 +362,6 @@ def init_test_case(self):
         self.paddings = [0, 0, 0, 0]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestAvgPoolAdaptiveAsyOutSize(TestCase1):
     def init_adaptive(self):
         self.adaptive = True
@@ -379,15 +371,13 @@ def init_shape(self):
 
     def init_test_case(self):
         self.ksize = [2, 4]
-        self.strides = [2, 4]
+        # fixme: CANN AvgPoolV2Grad does not support asymmetric strides
+        # self.strides = [2, 4]
+        self.strides = [4, 4]
         self.paddings = [0, 0, 0, 0]
 
 
 #-------test pool2d with asymmetric padding-----
-
-
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestPool2D_AsyPadding(TestPool2D_Op):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -398,8 +388,6 @@ def init_shape(self):
         self.shape = [2, 3, 5, 5]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase1_AsyPadding(TestCase1):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -410,8 +398,6 @@ def init_shape(self):
         self.shape = [2, 3, 7, 7]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase2_AsyPadding(TestCase2):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -422,8 +408,6 @@ def init_shape(self):
         self.shape = [2, 3, 7, 7]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase3_AsyPadding(TestCase3):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -434,8 +418,6 @@ def init_shape(self):
         self.shape = [2, 3, 5, 5]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase4_AsyPadding(TestCase4):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -446,8 +428,6 @@ def init_shape(self):
         self.shape = [2, 3, 7, 7]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase5_AsyPadding((TestCase5)):
     def init_test_case(self):
         self.ksize = [3, 3]
@@ -458,8 +438,6 @@ def init_shape(self):
         self.shape = [2, 3, 7, 7]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestAvgInclude_AsyPadding(TestCase2):
     def init_exclusive(self):
         self.exclusive = False
@@ -473,22 +451,20 @@ def init_shape(self):
         self.shape = [2, 3, 7, 7]
 
 
-# class TestAvgPoolAdaptive_AsyPadding(TestCase1):
-#     def init_adaptive(self):
-#         self.adaptive = True
+class TestAvgPoolAdaptive_AsyPadding(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
 
-#     def init_test_case(self):
-#         self.ksize = [2, 2]
-#         self.strides = [2, 2]
-#         self.paddings = [0, 0, 0, 0]
+    def init_test_case(self):
+        self.ksize = [2, 2]
+        self.strides = [2, 2]
+        self.paddings = [1, 1, 0, 2]
 
-#     def init_shape(self):
-#         self.shape = [2, 3, 8, 8]
+    def init_shape(self):
+        self.shape = [2, 3, 8, 8]
 
 
 #----------- test channel_last --------------
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestPool2D_channel_last(TestPool2D_Op):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -497,8 +473,6 @@ def init_shape(self):
         self.shape = [2, 5, 5, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase1_channel_last(TestCase1):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -507,8 +481,6 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase2_channel_last(TestCase2):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -517,8 +489,6 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase3_channel_last(TestCase3):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -527,8 +497,6 @@ def init_shape(self):
         self.shape = [2, 5, 5, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase4_channel_last(TestCase4):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -537,8 +505,6 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase5_channel_last(TestCase5):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -547,15 +513,11 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase5_Max(TestCase2):
     def init_pool_type(self):
         self.pool_type = "max"
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase5_channel_last_Max(TestCase5_Max):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -564,22 +526,23 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestAvgInclude_channel_last(TestCase2_channel_last):
     def init_exclusive(self):
         self.exclusive = False
 
 
-# @unittest.skipIf(not paddle.is_compiled_with_npu(),
-#                  "core is not compiled with NPU")
-# class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
-#     def init_adaptive(self):
-#         self.adaptive = True
+class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last):
+    def init_adaptive(self):
+        self.adaptive = True
+
+    def init_shape(self):
+        self.shape = [2, 8, 8, 3]
+
+    def init_test_case(self):
+        self.ksize = [2, 2]
+        self.strides = [2, 2]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -588,8 +551,6 @@ def init_shape(self):
         self.shape = [2, 5, 5, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase1_AsyPadding_channel_last(TestCase1_AsyPadding):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -598,8 +559,6 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase2_AsyPadding_channel_last(TestCase2_AsyPadding):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -608,8 +567,6 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase3_AsyPadding_channel_last(TestCase3_AsyPadding):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -618,8 +575,6 @@ def init_shape(self):
         self.shape = [2, 5, 5, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase4_AsyPadding_channel_last(TestCase4_AsyPadding):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -628,8 +583,6 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase5_AsyPadding_channel_last(TestCase5_AsyPadding):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -638,8 +591,6 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestAvgInclude_AsyPadding_channel_last(TestAvgInclude_AsyPadding):
     def init_data_format(self):
         self.data_format = "NHWC"
@@ -648,23 +599,21 @@ def init_shape(self):
         self.shape = [2, 7, 7, 3]
 
 
-# @unittest.skipIf(not paddle.is_compiled_with_npu(),
-#                  "core is not compiled with NPU")
-# class TestAvgPoolAdaptive_AsyPadding_channel_last(
-#         TestAvgPoolAdaptive_AsyPadding):
-#     def init_data_format(self):
-#         self.data_format = "NHWC"
+class TestAvgPoolAdaptive_AsyPadding_channel_last(
+        TestAvgPoolAdaptive_AsyPadding):
+    def init_data_format(self):
+        self.data_format = "NHWC"
 
-#     def init_shape(self):
-#         self.shape = [2, 7, 7, 3]
+    def init_shape(self):
+        self.shape = [2, 8, 8, 3]
 
 
-@unittest.skipIf(not paddle.is_compiled_with_npu(),
-                 "core is not compiled with NPU")
 class TestCase1_strides(TestCase1):
     def init_test_case(self):
         self.ksize = [3, 3]
-        self.strides = [1, 2]
+        # fixme: CANN AvgPoolV2Grad does not support asymmetric strides
+        # self.strides = [1, 2]
+        self.strides = [2, 2]
 
     def init_shape(self):
         self.shape = [2, 3, 4, 5]
@@ -706,5 +655,18 @@ def init_shape(self):
 create_test_use_ceil_class(TestCase1_AsyPadding_channel_last)
 create_test_use_ceil_class(TestCase2_AsyPadding_channel_last)
 
+create_test_fp16_class(TestPool2D_Op)
+create_test_fp16_class(TestCase1)
+create_test_fp16_class(TestCase2)
+create_test_fp16_class(TestCase3)
+create_test_fp16_class(TestCase4)
+create_test_fp16_class(TestCase5)
+create_test_fp16_class(TestPool2D_channel_last)
+create_test_fp16_class(TestCase1_channel_last)
+create_test_fp16_class(TestCase2_channel_last)
+create_test_fp16_class(TestCase3_channel_last)
+create_test_fp16_class(TestCase4_channel_last)
+create_test_fp16_class(TestCase5_channel_last)
+
 if __name__ == "__main__":
     unittest.main()

From 78afb7bea4fd2be085b10b384ede94e87b49267a Mon Sep 17 00:00:00 2001
From: ronny1996 <524019753@qq.com>
Date: Mon, 16 Aug 2021 03:33:11 +0000
Subject: [PATCH 3/4] update pool2d_backward_navie

---
 .../tests/unittests/npu/test_pool2d_op_npu.py | 160 ++++++++++--------
 1 file changed, 87 insertions(+), 73 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
index 83ad6b4101f577..2b8550a88de592 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py
@@ -23,7 +23,7 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from op_test import OpTest
-from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive
+from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive, adaptive_start_index, adaptive_end_index
 from paddle.nn.functional import avg_pool2d, max_pool2d
 
 paddle.enable_static()
@@ -78,13 +78,14 @@ def test_check_grad(self):
 def pool2d_backward_navie(x,
                           ksize,
                           strides,
-                          padding_algorithm,
                           paddings,
-                          pool_type,
-                          is_global,
-                          is_adaptive,
-                          is_exclusive,
-                          data_format="NCHW"):
+                          global_pool=0,
+                          ceil_mode=False,
+                          exclusive=True,
+                          adaptive=False,
+                          data_format='NCHW',
+                          pool_type="max",
+                          padding_algorithm="EXPLICIT"):
     # update paddings
     def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
         padding = []
@@ -108,6 +109,11 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
 
         if padding_algorithm == "VALID":
             paddings = [0, 0, 0, 0]
+            if ceil_mode != False:
+                raise ValueError(
+                    "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)"
+                    " must be False. "
+                    "Received ceil_mode: True.")
         elif padding_algorithm == "SAME":
             input_data_shape = []
             if data_format == "NCHW":
@@ -116,72 +122,72 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride):
                 input_data_shape = x.shape[1:3]
             paddings = _get_padding_with_SAME(input_data_shape, ksize, strides)
 
+    assert len(paddings) == 2 or len(paddings) == 4
+    is_sys = True if len(paddings) == 2 else False
+
     if data_format == "NHWC":
         x = x.transpose([0, 3, 1, 2])
 
-    if is_global:
-        ksize = x.shape[2:]
-        paddings = [0, 0, 0, 0]
-
-    if is_adaptive:
-        pass
-
-    if len(paddings) == 2:
-        paddings = [paddings[0], paddings[0], paddings[1], paddings[1]]
-    x_shape = np.array(x.shape) + np.array(
-        [0, 0, paddings[0] + paddings[1], paddings[2] + paddings[3]])
-    x_new = np.zeros(x_shape)
-
-    x_old_shape = x.shape
-    x_new_shape = x_new.shape
-
-    N, C, H, W = x_old_shape
-    for n in range(N):
-        for c in range(C):
-            for h in range(H):
-                for w in range(W):
-                    x_new[n, c, h + paddings[0], w + paddings[2]] = x[n, c, h,
-                                                                      w]
-    x = x_new
-    N, C, H, W = x_new_shape
-    x_grad = np.zeros(x.shape, dtype=np.float32)
-    for n in range(N):
-        for c in range(C):
-            for h in range(0, H - ksize[0] + 1, strides[0]):
-                for w in range(0, W - ksize[1] + 1, strides[1]):
-                    start_h = h
-                    start_w = w
-                    end_h = h + ksize[0]
-                    end_w = w + ksize[1]
-                    if is_exclusive:
-                        start_h = max(start_h, paddings[0])
-                        start_w = max(start_w, paddings[2])
-                        end_h = min(end_h, H - paddings[1])
-                        end_w = min(end_w, W - paddings[3])
-                    idx = np.meshgrid(
-                        range(start_h, end_h), range(start_w, end_w))
-                    if pool_type == "max":
-                        idx = np.argmax(x[n, c, idx[0], idx[1]].flatten())
-                        idx_h = idx // (end_w - start_w)
-                        idx_w = idx % (end_w - start_w)
-                        x_grad[n, c, start_h + idx_h, start_w + idx_w] += 1
-                    elif pool_type == "avg":
-                        idx = np.meshgrid(
-                            range(start_h, end_h), range(start_w, end_w))
-                        x_grad[n, c, idx[0], idx[1]] += 1 / (
-                            end_h - start_h) / (end_w - start_w)
-                        if is_adaptive:
-                            x_grad[n, c, idx[0], idx[1]] /= np.prod(strides)
-
-    x_grad_new = np.zeros(x_old_shape)
-    N, C, H, W = x_old_shape
-    for n in range(N):
-        for c in range(C):
-            for h in range(H):
-                for w in range(W):
-                    x_grad_new[n, c, h, w] = x_grad[n, c, h + paddings[0], w +
-                                                    paddings[2]]
-    x_grad = x_grad_new
+    N, C, H, W = x.shape
+
+    if global_pool == 1:
+        ksize = [H, W]
+        paddings = [0 for _ in range(len(paddings))]
+
+    pad_h_up = paddings[0] if is_sys else paddings[0]
+    pad_h_down = paddings[0] if is_sys else paddings[1]
+    pad_w_left = paddings[1] if is_sys else paddings[2]
+    pad_w_right = paddings[1] if is_sys else paddings[3]
+
+    if adaptive:
+        H_out, W_out = ksize
+    else:
+        H_out = (H - ksize[0] + pad_h_up + pad_h_down + strides[0] - 1) // strides[0] + 1 \
+            if ceil_mode else (H - ksize[0] + pad_h_up + pad_h_down) // strides[0] + 1
+        W_out = (W - ksize[1] + pad_w_left + pad_w_right + strides[1] - 1) // strides[1] + 1 \
+            if ceil_mode else (W - ksize[1] + pad_w_left + pad_w_right) // strides[1] + 1
+
+    x_grad = np.zeros_like(x)
+    for i in range(H_out):
+        if adaptive:
+            in_h_start = adaptive_start_index(i, H, ksize[0])
+            in_h_end = adaptive_end_index(i, H, ksize[0])
+        else:
+            in_h_start = np.max((i * strides[0] - pad_h_up, 0))
+            in_h_end = np.min((i * strides[0] + ksize[0] - pad_h_up, H))
+
+        for j in range(W_out):
+            if adaptive:
+                in_w_start = adaptive_start_index(j, W, ksize[1])
+                in_w_end = adaptive_end_index(j, W, ksize[1])
+            else:
+                in_h_start = i * strides[0] - pad_h_up
+                in_w_start = j * strides[1] - pad_w_left
+                in_h_end = i * strides[0] + ksize[0] - pad_h_up
+                in_w_end = j * strides[1] + ksize[1] - pad_w_left
+
+                field_size = (in_h_end - in_h_start) * (in_w_end - in_w_start)
+                in_h_start = np.max((in_h_start, 0))
+                in_w_start = np.max((in_w_start, 0))
+                in_h_end = np.min((in_h_end, H))
+                in_w_end = np.min((in_w_end, W))
+
+            if pool_type == 'avg':
+                if (exclusive or adaptive):
+                    field_size = (in_h_end - in_h_start) * (
+                        in_w_end - in_w_start)
+                x_grad[:, :, in_h_start:in_h_end, in_w_start:
+                       in_w_end] += 1 / field_size
+            elif pool_type == 'max':
+                for n in range(N):
+                    for c in range(C):
+                        idx = np.argmax(x[n, c, in_h_start:in_h_end, in_w_start:
+                                          in_w_end].flatten())
+                        idx_h = idx // (in_w_end - in_w_start)
+                        idx_w = idx % (in_w_end - in_w_start)
+                        x_grad[n, c, in_h_start + idx_h, in_w_start +
+                               idx_w] += 1
+
     if data_format == "NHWC":
         x_grad = x_grad.transpose([0, 2, 3, 1])
     return x_grad
@@ -276,9 +282,17 @@ def test_check_output(self):
 
     def test_check_grad(self):
         x_grad = pool2d_backward_navie(
-            self.inputs['X'], self.ksize, self.strides, self.padding_algorithm,
-            self.paddings, self.pool_type, self.global_pool, self.adaptive,
-            self.exclusive, self.data_format)
+            self.inputs["X"],
+            ksize=self.ksize,
+            strides=self.strides,
+            paddings=self.paddings,
+            global_pool=self.global_pool,
+            ceil_mode=False,
+            exclusive=self.exclusive,
+            adaptive=self.adaptive,
+            data_format=self.data_format,
+            pool_type=self.pool_type,
+            padding_algorithm=self.padding_algorithm)
         x_grad = x_grad / np.prod(self.outputs['Out'].shape)
         self.check_grad_with_place(
             fluid.NPUPlace(0),

From b4a8f841895e257cd00f6e293da74e426f20a382 Mon Sep 17 00:00:00 2001
From: ronny1996 <524019753@qq.com>
Date: Fri, 20 Aug 2021 09:06:53 +0000
Subject: [PATCH 4/4] clean headers

---
 paddle/fluid/operators/pool_op_npu.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc
index 1b52405c8b3659..b5eb8ae61787d1 100644
--- a/paddle/fluid/operators/pool_op_npu.cc
+++ b/paddle/fluid/operators/pool_op_npu.cc
@@ -12,12 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/pool_op.h"
-#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
-#include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/fill_constant_op.h"
-
 #include "paddle/fluid/operators/npu_op_runner.h"
 
 namespace paddle {