
Commit b6b3242

ascend transpose

Paddle CI_MAC committed
1 parent 0310945 commit b6b3242

File tree

3 files changed: +274 −0 lines changed

paddle/fluid/operators/CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -160,6 +160,10 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 add_subdirectory(benchmark)

 cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op)
+if (WITH_ASCEND_CL)
+    cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor)
+endif()
+
 if(WITH_MKLDNN)
     include(mkldnn/inplace_op_tests.cmake)

paddle/fluid/operators/transpose_op_npu.cc
Lines changed: 121 additions & 0 deletions

@@ -0,0 +1,121 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class TransposeNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::LoDTensor>("X");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
    // Allocate the output before handing it to the NPU runner.
    out->mutable_data<T>(ctx.GetPlace());
    framework::NPUAttributeMap attr_input = {{"perm", axis}};
    auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input);
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

template <typename T>
class TransposeGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out_grad =
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
    auto* x_grad =
        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
    // The gradient of a transpose is a transpose by the inverse permutation:
    // the forward pass moved input dim axis[i] to output dim i, so the
    // backward pass must move it back.
    std::vector<int> reversed_axis(axis);
    for (size_t i = 0; i < axis.size(); i++) {
      reversed_axis[axis[i]] = static_cast<int>(i);
    }
    x_grad->mutable_data<T>(ctx.GetPlace());
    framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}};
    auto runner = NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input);
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    transpose,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext, uint8_t>,
    ops::TransposeNPUKernel<paddle::platform::NPUDeviceContext, int8_t>);

REGISTER_OP_NPU_KERNEL(
    transpose_grad,
    ops::TransposeGradNPUKernel<float>,
    ops::TransposeGradNPUKernel<paddle::platform::float16>,
    ops::TransposeGradNPUKernel<int>,
    ops::TransposeGradNPUKernel<uint8_t>,
    ops::TransposeGradNPUKernel<int8_t>);

#endif
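The inverse-permutation step is the heart of the grad kernel above. As a quick illustration, here is a minimal host-only sketch (not part of this commit; it assumes nothing beyond the C++ standard library) of the same reversed_axis construction:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Forward transpose: output dim i takes input dim axis[i].
  std::vector<int> axis = {1, 2, 0};
  // Invert it: reversed_axis[axis[i]] = i, exactly as in
  // TransposeGradNPUKernel.
  std::vector<int> reversed_axis(axis.size());
  for (size_t i = 0; i < axis.size(); ++i) {
    reversed_axis[axis[i]] = static_cast<int>(i);
  }
  // Prints 2 0 1: applying perm {1, 2, 0} and then {2, 0, 1} restores
  // the original dimension order.
  for (int v : reversed_axis) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}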
paddle/fluid/operators/transpose_op_npu_test.cc

Lines changed: 149 additions & 0 deletions

@@ -0,0 +1,149 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef _WIN32
#include <unistd.h>
#endif

#include <cmath>
#include <iostream>
#include <numeric>
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(transpose);
USE_OP_DEVICE_KERNEL(transpose, NPU);

template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  // Set up input and output variables in the scope.
  auto x = scope->Var("X");
  auto out = scope->Var("Out");
  auto* x_t = x->GetMutable<f::LoDTensor>();
  auto* out_t = out->GetMutable<f::LoDTensor>();
  auto place = ctx.GetPlace();

  int dim0 = 2;
  int dim1 = 2;
  TensorFromVector(std::vector<T>({0, 1, 2, 3}), ctx, x_t);
  ctx.Wait();
  x_t->Resize({dim0, dim1});
  out_t->Resize({dim0, dim1});
  out_t->mutable_data<T>(place);
  ctx.Wait();

  f::AttributeMap attrs = {{"axis", std::vector<int>({1, 0})},
                           {"data_format", std::string("AnyLayout")}};
  auto op = f::OpRegistry::CreateOp("transpose", {{"X", {"X"}}},
                                    {{"Out", {"Out"}}}, attrs);
  op->Run(*scope, place);
  ctx.Wait();

  std::vector<T> out_v;
  TensorToVector(*out_t, ctx, &out_v);
  ctx.Wait();

  // Transposing the row-major 2x2 input [[0, 1], [2, 3]] with perm {1, 0}
  // yields [[0, 2], [1, 3]].
  EXPECT_EQ(out_t->numel(), dim0 * dim1);
  EXPECT_EQ(out_v[0], 0);
  EXPECT_EQ(out_v[1], 2);
  EXPECT_EQ(out_v[2], 1);
  EXPECT_EQ(out_v[3], 3);
}

template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
  // Set up forward and gradient variables in the scope.
  auto x = scope->Var("X");
  auto x_grad = scope->Var("X@GRAD");
  auto out = scope->Var("Out");
  auto out_grad = scope->Var("Out@GRAD");

  auto* x_grad_t = x_grad->GetMutable<f::LoDTensor>();
  auto* x_t = x->GetMutable<f::LoDTensor>();
  auto* out_grad_t = out_grad->GetMutable<f::LoDTensor>();
  auto* out_t = out->GetMutable<f::LoDTensor>();
  int dim0 = 2;
  int dim1 = 2;
  auto place = ctx.GetPlace();

  TensorFromVector(std::vector<T>({0, 1, 2, 3}), ctx, out_grad_t);
  TensorFromVector(std::vector<T>({0, 1, 2, 3}), ctx, x_t);
  ctx.Wait();
  x_grad_t->Resize({dim0, dim1});
  x_t->Resize({dim0, dim1});
  out_grad_t->Resize({dim0, dim1});
  out_t->Resize({dim0, dim1});

  x_grad_t->mutable_data<T>(place);
  out_t->mutable_data<T>(place);
  ctx.Wait();

  f::AttributeMap attrs = {{"axis", std::vector<int>({1, 0})},
                           {"data_format", std::string("AnyLayout")}};
  auto op = f::OpRegistry::CreateOp(
      "transpose_grad",
      {{"Out@GRAD", {"Out@GRAD"}}, {"X", {"X"}}, {"Out", {"Out"}}},
      {{"X@GRAD", {"X@GRAD"}}}, attrs);
  op->Run(*scope, place);
  ctx.Wait();

  std::vector<T> out_v;
  TensorToVector(*x_grad_t, ctx, &out_v);
  ctx.Wait();

  // The grad kernel applies the inverse permutation to Out@GRAD; for a 2-D
  // transpose the permutation {1, 0} is its own inverse.
  EXPECT_EQ(x_grad_t->numel(), dim0 * dim1);
  EXPECT_EQ(out_v[0], 0);
  EXPECT_EQ(out_v[1], 2);
  EXPECT_EQ(out_v[2], 1);
  EXPECT_EQ(out_v[3], 3);
}

TEST(transpose, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx);
}

TEST(transpose_grad, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx);
}
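For reference, a minimal host-only sketch (not part of this commit; plain standard C++) that reproduces the values asserted in Compare<float> above by transposing a row-major 2x2 buffer with perm {1, 0}:

#include <cassert>
#include <vector>

int main() {
  const int dim0 = 2, dim1 = 2;
  std::vector<float> x = {0, 1, 2, 3};  // row-major [[0, 1], [2, 3]]
  std::vector<float> out(dim0 * dim1);
  // Transpose with perm {1, 0}: out[j][i] = x[i][j].
  for (int i = 0; i < dim0; ++i) {
    for (int j = 0; j < dim1; ++j) {
      out[j * dim0 + i] = x[i * dim1 + j];
    }
  }
  // Matches the EXPECT_EQ checks in Compare<float>: {0, 2, 1, 3}.
  assert(out[0] == 0 && out[1] == 2 && out[2] == 1 && out[3] == 3);
  return 0;
}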
