From b8d1789f410595faab7df68ba08af73c984f1583 Mon Sep 17 00:00:00 2001
From: Meiyim
Date: Thu, 11 Mar 2021 13:43:27 +0800
Subject: [PATCH 1/7] [npu] support npu for kernel `transpose`

---
 paddle/fluid/operators/transpose_op_npu.cc    |  83 ++++++++++
 .../fluid/operators/transpose_op_npu_test.cc  | 149 ++++++++++++++++++
 .../unittests/npu/test_transpose_op_npu.py    |  88 +++++++++++
 3 files changed, 320 insertions(+)
 create mode 100644 paddle/fluid/operators/transpose_op_npu.cc
 create mode 100644 paddle/fluid/operators/transpose_op_npu_test.cc
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py

diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc
new file mode 100644
index 00000000000000..1a680097a9834d
--- /dev/null
+++ b/paddle/fluid/operators/transpose_op_npu.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class TransposeNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    framework::NPUAttributeMap attr_input = {{"perm", axis}};
+    auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
+    auto stream = ctx.template device_context<paddle::platform::NPUDeviceContext>().stream();
+    runner.Run(stream);
+    //ctx.device_context().Wait();
+
+  }
+};
+
+template <typename T>
+class TransposeGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto* out_grad = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* x_grad = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    std::vector<int> reversed_axis(axis);
+    for (size_t i = 0; i < axis.size(); i++) {
+      reversed_axis[axis[i]] = i;
+    }
+
+    framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}};
+    auto runner = NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input);
+    auto stream = ctx.template device_context<paddle::platform::NPUDeviceContext>().stream();
+    runner.Run(stream);
+  }
+};
+
+}
+}
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(transpose,
+    ops::TransposeNPUKernel<float>,
+    ops::TransposeNPUKernel<paddle::platform::float16>,
+    ops::TransposeNPUKernel<int>,
+    ops::TransposeNPUKernel<uint8_t>,
+    ops::TransposeNPUKernel<int8_t>
+);
+
+REGISTER_OP_NPU_KERNEL(transpose_grad,
+    ops::TransposeGradNPUKernel<float>,
+    ops::TransposeGradNPUKernel<paddle::platform::float16>,
+    ops::TransposeGradNPUKernel<int>,
+    ops::TransposeGradNPUKernel<uint8_t>,
+    ops::TransposeGradNPUKernel<int8_t>
+);
+
+
+
+#endif
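A note on the gradient kernel above: the `reversed_axis` loop computes the inverse permutation of `axis`. If `TransposeD` with perm `axis` moves input dimension `axis[i]` to output dimension `i`, then running it again with `reversed_axis` (where `reversed_axis[axis[i]] = i`) maps every dimension back, so the backward pass is just another `TransposeD` call. A minimal host-only sketch of that identity (plain standard C++, no Paddle or NPU dependency; the function name is illustrative and not part of the patch):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Invert a permutation: if perm[i] == j, the inverse maps j back to i.
// This mirrors the reversed_axis loop in TransposeGradNPUKernel.
std::vector<int> InversePermutation(const std::vector<int>& perm) {
  std::vector<int> inv(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) {
    inv[perm[i]] = static_cast<int>(i);
  }
  return inv;
}

int main() {
  const std::vector<int> axis = {0, 2, 1, 3};
  const std::vector<int> inv = InversePermutation(axis);
  // Composing a permutation with its inverse yields the identity.
  for (size_t i = 0; i < axis.size(); ++i) {
    assert(inv[axis[i]] == static_cast<int>(i));
  }
  return 0;
}
```

One caveat: the perms exercised by the tests in this series, {1, 0} and {0, 2, 1, 3}, are both self-inverse, so they cannot distinguish a perm from its inverse; a perm such as {1, 2, 0} would.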
diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc
new file mode 100644
index 00000000000000..25f95f5c7780f2
--- /dev/null
+++ b/paddle/fluid/operators/transpose_op_npu_test.cc
@@ -0,0 +1,149 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <memory>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include <iostream>
+#include <numeric>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(transpose);
+USE_OP_DEVICE_KERNEL(transpose, NPU);
+
+
+template <typename T>
+void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("X");
+  auto out = scope->Var("Out");
+  auto* x_t = x->GetMutable<f::LoDTensor>();
+  auto* out_t = out->GetMutable<f::LoDTensor>();
+  auto place = ctx.GetPlace();
+
+  int dim0=2;
+  int dim1=3;
+  TensorFromVector(std::vector<T>({0,1,2,3,4,5}), ctx, x_t);
+  ctx.Wait();
+  x_t->Resize({dim0, dim1});
+  out_t->Resize({dim0, dim1});
+  ctx.Wait();
+  out_t->mutable_data<T>(place);
+  ctx.Wait();
+
+  f::AttributeMap attrs = {
+      {"axis", std::vector<int>({1, 0})},
+      {"data_format", std::string("AnyLayout")}
+  };
+  auto op = f::OpRegistry::CreateOp("transpose", {{"X", {"X"}}},
+                                    {{"Out", {"Out"}}}, attrs);
+  ctx.Wait();
+  op->Run(*scope, place);
+  ctx.Wait();
+  std::vector<T> out_v;
+  TensorToVector(*out_t, ctx, &out_v);
+  ctx.Wait();
+
+  EXPECT_EQ(out_t->numel(), dim0 * dim1);
+  EXPECT_EQ(out_v[0], 0);
+  EXPECT_EQ(out_v[1], 3);
+  EXPECT_EQ(out_v[2], 1);
+  EXPECT_EQ(out_v[3], 4);
+  EXPECT_EQ(out_v[4], 2);
+  EXPECT_EQ(out_v[5], 5);
+}
+
+
+template <typename T>
+void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("X");
+  auto x_grad = scope->Var("X@GRAD");
+  auto out = scope->Var("Out");
+  auto out_grad = scope->Var("Out@GRAD");
+
+  auto* x_grad_t = x_grad->GetMutable<f::LoDTensor>();
+  auto* x_t = x->GetMutable<f::LoDTensor>();
+  auto* out_grad_t = out_grad->GetMutable<f::LoDTensor>();
+  auto* out_t = out->GetMutable<f::LoDTensor>();
+  int dim0=2;
+  int dim1=3;
+  auto place = ctx.GetPlace();
+
+  TensorFromVector(std::vector<T>({0,1,2,3,4,5}), ctx, out_grad_t);
+  TensorFromVector(std::vector<T>({0,1,2,3,4,5}), ctx, x_t);
+  ctx.Wait();
+  x_grad_t->Resize({dim0, dim1});
+  x_t->Resize({dim0, dim1});
+  out_grad_t->Resize({dim0, dim1});
+  out_t->Resize({dim0, dim1});
+
+  //out_grad_t->mutable_data<T>(place);
+  x_grad_t->mutable_data<T>(place);
+  out_t->mutable_data<T>(place);
+  ctx.Wait();
+
+  f::AttributeMap attrs = {
+      {"axis", std::vector<int>({1, 0})},
+      {"data_format", std::string("AnyLayout")}
+  };
+  /*
+  {"mkldnn_data_type", "float32"},
+  {"use_mkldnn", false},
+  {"use_quantizer", false},
+  */
+  auto op = f::OpRegistry::CreateOp("transpose_grad", {{"Out@GRAD", {"Out@GRAD"}}, {"X", {"X"}}, {"Out", {"Out"}}},
+                                    {{"X@GRAD", {"X@GRAD"}}}, attrs);
+  op->Run(*scope, place);
+  ctx.Wait();
+  std::vector<T> out_v;
+  TensorToVector(*x_grad_t, ctx, &out_v);
+  ctx.Wait();
+
+  EXPECT_EQ(x_grad_t->numel(), dim0 * dim1);
+  EXPECT_EQ(out_v[0], 0);
+  EXPECT_EQ(out_v[1], 3);
+  EXPECT_EQ(out_v[2], 1);
+  EXPECT_EQ(out_v[3], 4);
+  EXPECT_EQ(out_v[4], 2);
+  EXPECT_EQ(out_v[5], 5);
+
+}
+
+
+TEST(transpose, NPU_fp32) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare<float>(&scope, ctx);
+}
+
+TEST(transpose_grad, NPU_fp32) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  CompareGrad<float>(&scope, ctx);
+}
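The hard-coded expectations in `Compare` follow from row-major layout: {0, 1, 2, 3, 4, 5} viewed as a 2x3 matrix is [[0, 1, 2], [3, 4, 5]], and perm {1, 0} produces the 3x2 matrix [[0, 3], [1, 4], [2, 5]], which flattens to {0, 3, 1, 4, 2, 5}. A host-only sketch that reproduces those numbers without an NPU (illustrative standard C++, not part of the patch):

```cpp
#include <cassert>
#include <vector>

// Transpose a row-major dim0 x dim1 matrix stored as a flat vector.
std::vector<int> Transpose2D(const std::vector<int>& in, int dim0, int dim1) {
  std::vector<int> out(in.size());
  for (int i = 0; i < dim0; ++i) {
    for (int j = 0; j < dim1; ++j) {
      out[j * dim0 + i] = in[i * dim1 + j];  // element (i, j) -> (j, i)
    }
  }
  return out;
}

int main() {
  const std::vector<int> x = {0, 1, 2, 3, 4, 5};         // 2x3 input, as in Compare
  const std::vector<int> expected = {0, 3, 1, 4, 2, 5};  // values asserted by the test
  assert(Transpose2D(x, 2, 3) == expected);
  return 0;
}
```

The same values explain `CompareGrad`: a two-dimensional swap is its own inverse, so the backward pass applies the identical remapping to `Out@GRAD`.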
diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
new file mode 100644
index 00000000000000..931b1942b18ef3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest, _set_use_system_allocator
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestTransposeOp(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "transpose"
+        self.place = paddle.NPUPlace(0)
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)}
+        self.attrs = {'axis': [0, 2, 1, 3], 'data_format': 'AnyLayout'}
+        self.outputs = {'Out': self.out}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [8, 512, 12, 64]).astype(self.dtype)
+        self.out = np.transpose(self.x, [0, 2, 1, 3])
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def init_axis(self):
+        self.axis = -1
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+    # TODO(ascendrc): Test grad op after it is implemented.
+    # def test_check_grad_normal(self):
+    #     self.check_grad_with_place(
+    #         self.place, ['X', 'Y'],
+    #         'Out',
+    #         max_relative_error=0.006,
+    #         check_dygraph=False)
+    #
+    # def test_check_grad_ingore_x(self):
+    #     self.check_grad_with_place(
+    #         self.place, ['Y'],
+    #         'Out',
+    #         no_grad_set=set("X"),
+    #         max_relative_error=0.006,
+    #         check_dygraph=False)
+    #
+    # def test_check_grad_ingore_y(self):
+    #     self.check_grad_with_place(
+    #         self.place, ['X'],
+    #         'Out',
+    #         no_grad_set=set("Y"),
+    #         max_relative_error=0.006, check_dygraph=False)
+
+
+if __name__ == '__main__':
+    unittest.main()

From e7f149b3c727f9f40d9a2b6980ef9bb771357835 Mon Sep 17 00:00:00 2001
From: Meiyim
Date: Fri, 12 Mar 2021 14:04:39 +0800
Subject: [PATCH 2/7] +cmake

---
 paddle/fluid/operators/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 17234edb116e3e..203a2fb0807411 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -160,6 +160,10 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 add_subdirectory(benchmark)
 
 cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op)
 
+if (WITH_ASCEND_CL)
+    cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor)
+endif()
+
 if(WITH_MKLDNN)
     include(mkldnn/inplace_op_tests.cmake)

From 38884f71f0556bc6bd2fdd04e86e31230ef05e81 Mon Sep 17 00:00:00 2001
From: Meiyim
Date: Tue, 16 Mar 2021 16:13:01 +0800
Subject: [PATCH 3/7] +add init

---
 paddle/fluid/operators/transpose_op_npu.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc
index 1a680097a9834d..cef1dcee8c4eda 100644
--- a/paddle/fluid/operators/transpose_op_npu.cc
+++ b/paddle/fluid/operators/transpose_op_npu.cc
@@ -29,10 +29,11 @@ class TransposeNPUKernel : public framework::OpKernel<T> {
     auto* out = ctx.Output<framework::LoDTensor>("Out");
     std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
     framework::NPUAttributeMap attr_input = {{"perm", axis}};
+    out->mutable_data<T>(ctx.device_context().GetPlace());
     auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
     auto stream = ctx.template device_context<paddle::platform::NPUDeviceContext>().stream();
     runner.Run(stream);
-    //ctx.device_context().Wait();
+    //.Wait();
 
   }

From 053a734047e9d8d9182b15fbe44947a788046434 Mon Sep 17 00:00:00 2001
From: Meiyim
Date: Tue, 16 Mar 2021 16:20:46 +0800
Subject: [PATCH 4/7] cleanup

---
 .../fluid/operators/transpose_op_npu_test.cc  | 38 ++++++-----------
 .../unittests/npu/test_transpose_op_npu.py    | 23 ----------
 2 files changed, 16 insertions(+), 45 deletions(-)

diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc
index 25f95f5c7780f2..c7a791956fbe56 100644
--- a/paddle/fluid/operators/transpose_op_npu_test.cc
+++ b/paddle/fluid/operators/transpose_op_npu_test.cc
@@ -45,16 +45,15 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
   auto* out_t = out->GetMutable<f::LoDTensor>();
   auto place = ctx.GetPlace();
 
-  int dim0=2;
-  int dim1=3;
-  TensorFromVector(std::vector<T>({0,1,2,3,4,5}), ctx, x_t);
+  int dim0 = 2;
+  int dim1 = 3;
+  TensorFromVector(std::vector<T>({0, 1, 2, 3, 4, 5}), ctx, x_t);
   ctx.Wait();
-  x_t->Resize({dim0, dim1});
+  x_t->Resize({dim0, dim1});
   out_t->Resize({dim0, dim1});
   ctx.Wait();
   out_t->mutable_data<T>(place);
   ctx.Wait();
-
   f::AttributeMap attrs = {
       {"axis", std::vector<int>({1, 0})},
{"data_format", std::string("AnyLayout")} @@ -63,10 +62,10 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { {{"Out", {"Out"}}}, attrs); ctx.Wait(); op->Run(*scope, place); - ctx.Wait(); + ctx.Wait(); std::vector out_v; TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); + ctx.Wait(); EXPECT_EQ(out_t->numel(), dim0 * dim1); EXPECT_EQ(out_v[0], 0); @@ -90,34 +89,29 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { auto* x_t = x->GetMutable(); auto* out_grad_t = out_grad->GetMutable(); auto* out_t = out->GetMutable(); - int dim0=2; - int dim1=3; + int dim0 = 2; + int dim1 = 3; auto place = ctx.GetPlace(); - TensorFromVector(std::vector({0,1,2,3,4,5}), ctx, out_grad_t); - TensorFromVector(std::vector({0,1,2,3,4,5}), ctx, x_t); + TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, out_grad_t); + TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, x_t); ctx.Wait(); - x_grad_t->Resize({dim0, dim1}); - x_t->Resize({dim0, dim1}); + x_grad_t->Resize({dim0, dim1}); + x_t->Resize({dim0, dim1}); out_grad_t->Resize({dim0, dim1}); out_t->Resize({dim0, dim1}); - //out_grad_t->mutable_data(place); x_grad_t->mutable_data(place); out_t->mutable_data(place); ctx.Wait(); - f::AttributeMap attrs = { {"axis", std::vector({1, 0})}, {"data_format", std::string("AnyLayout")} }; - /* - {"mkldnn_data_type", "float32"}, - {"use_mkldnn", false}, - {"use_quantizer", false}, - */ - auto op = f::OpRegistry::CreateOp("transpose_grad", {{"Out@GRAD", {"Out@GRAD"}}, {"X", {"X"}}, {"Out", {"Out"}}}, - {{"X@GRAD", {"X@GRAD"}}}, attrs); + auto op = f::OpRegistry::CreateOp( + "transpose_grad", + {{"Out@GRAD", {"Out@GRAD"}}, {"X", {"X"}}, {"Out", {"Out"}}}, + {{"X@GRAD", {"X@GRAD"}}}, attrs); op->Run(*scope, place); ctx.Wait(); std::vector out_v; diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py index 931b1942b18ef3..a3fd63a00f2fda 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py @@ -60,29 +60,6 @@ def init_axis(self): def test_check_output(self): self.check_output_with_place(self.place, check_dygraph=False) - # TODO(ascendrc): Test grad op after it is implemented. 
-    # def test_check_grad_normal(self):
-    #     self.check_grad_with_place(
-    #         self.place, ['X', 'Y'],
-    #         'Out',
-    #         max_relative_error=0.006,
-    #         check_dygraph=False)
-    #
-    # def test_check_grad_ingore_x(self):
-    #     self.check_grad_with_place(
-    #         self.place, ['Y'],
-    #         'Out',
-    #         no_grad_set=set("X"),
-    #         max_relative_error=0.006,
-    #         check_dygraph=False)
-    #
-    # def test_check_grad_ingore_y(self):
-    #     self.check_grad_with_place(
-    #         self.place, ['X'],
-    #         'Out',
-    #         no_grad_set=set("Y"),
-    #         max_relative_error=0.006, check_dygraph=False)
-
 
 if __name__ == '__main__':
     unittest.main()

From fd742d74373338442e23bb41958cecbb5bd4b53f Mon Sep 17 00:00:00 2001
From: Meiyim
Date: Wed, 17 Mar 2021 13:14:51 +0800
Subject: [PATCH 5/7] cleanup

---
 paddle/fluid/operators/transpose_op_npu.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc
index cef1dcee8c4eda..2d71bfdc725108 100644
--- a/paddle/fluid/operators/transpose_op_npu.cc
+++ b/paddle/fluid/operators/transpose_op_npu.cc
@@ -33,7 +33,6 @@ class TransposeNPUKernel : public framework::OpKernel<T> {
     auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
     auto stream = ctx.template device_context<paddle::platform::NPUDeviceContext>().stream();
     runner.Run(stream);
-    //.Wait();
 
   }
 };

From 236335208e7501733c208db2804c9d1d4b97b519 Mon Sep 17 00:00:00 2001
From: Meiyim
Date: Wed, 17 Mar 2021 17:23:13 +0800
Subject: [PATCH 6/7] fp16 python unit test

---
 .../fluid/tests/unittests/npu/test_transpose_op_npu.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
index a3fd63a00f2fda..9ee296875fb9b4 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
@@ -61,5 +61,13 @@ def test_check_output(self):
         self.check_output_with_place(self.place, check_dygraph=False)
 
 
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestTransposeOpFP16(OpTest):
+    no_need_check_grad = True
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 if __name__ == '__main__':
     unittest.main()

From 8ab8275fc4e91cf067a6181d5af90744058e679e Mon Sep 17 00:00:00 2001
From: Meiyim
Date: Wed, 17 Mar 2021 20:39:07 +0800
Subject: [PATCH 7/7] fix python unit test

---
 .../fluid/tests/unittests/npu/test_transpose_op_npu.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
index 9ee296875fb9b4..797531a6c0f99e 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py
@@ -61,13 +61,14 @@ def test_check_output(self):
         self.check_output_with_place(self.place, check_dygraph=False)
 
-
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
                  "core is not compiled with NPU")
-class TestTransposeOpFP16(OpTest):
-    no_need_check_grad = True
+class TestTransposeOpFP16(TestTransposeOp):
+    no_need_check_grad = True
+
     def init_dtype(self):
         self.dtype = np.float16
 
+
 if __name__ == '__main__':
     unittest.main()
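As a closing cross-check of the Python test's expectation: `np.transpose(self.x, [0, 2, 1, 3])` on an [8, 512, 12, 64] input swaps only axes 1 and 2, and the flat-index arithmetic generalizes the 2-D case used in the C++ test. A host-only sketch with a deliberately tiny stand-in shape (illustrative standard C++, unrelated to the Paddle API):

```cpp
#include <cassert>
#include <numeric>
#include <vector>

// Permute the axes of a row-major 4-D tensor: out[idx[perm]] = in[idx].
std::vector<float> Transpose4D(const std::vector<float>& in,
                               const std::vector<int>& dims,
                               const std::vector<int>& perm) {
  const std::vector<int> od = {dims[perm[0]], dims[perm[1]],
                               dims[perm[2]], dims[perm[3]]};
  std::vector<float> out(in.size());
  for (int a = 0; a < dims[0]; ++a)
    for (int b = 0; b < dims[1]; ++b)
      for (int c = 0; c < dims[2]; ++c)
        for (int d = 0; d < dims[3]; ++d) {
          const int idx[4] = {a, b, c, d};
          const int src = ((a * dims[1] + b) * dims[2] + c) * dims[3] + d;
          const int dst = ((idx[perm[0]] * od[1] + idx[perm[1]]) * od[2] +
                           idx[perm[2]]) * od[3] + idx[perm[3]];
          out[dst] = in[src];
        }
  return out;
}

int main() {
  const std::vector<int> dims = {2, 3, 2, 2};  // stand-in for {8, 512, 12, 64}
  std::vector<float> x(2 * 3 * 2 * 2);
  std::iota(x.begin(), x.end(), 0.0f);
  const auto y = Transpose4D(x, dims, {0, 2, 1, 3});
  // x at (0, 1, 0, 1) must land at y's (0, 0, 1, 1):
  // src = ((0*3+1)*2+0)*2+1 = 5, dst = ((0*2+0)*3+1)*2+1 = 3.
  assert(y[3] == x[5]);
  return 0;
}
```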