
Commit c16c32e

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into tile_op_npu

2 parents: 849bd23 + fa16c21

31 files changed (+1258, -278 lines)

cmake/external/xpu.cmake

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ ELSE ()
 ENDIF()
 
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210729")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210804")
 SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)

paddle/fluid/framework/ir/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -59,7 +59,7 @@ cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS grap
 
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
-pass_library(lock_free_optimize_pass base)
+pass_library(lock_free_optimize_pass base DEPS string_helper)
 pass_library(fc_fuse_pass inference)
 pass_library(map_matmul_to_mul_pass inference)
 pass_library(attention_lstm_fuse_pass inference)

paddle/fluid/imperative/tracer.cc

Lines changed: 2 additions & 0 deletions

@@ -30,6 +30,8 @@ DECLARE_string(tracer_mkldnn_ops_off);
 namespace paddle {
 namespace imperative {
 
+thread_local bool Tracer::has_grad_ = true;
+
 static std::shared_ptr<Tracer> g_current_tracer(nullptr);
 
 const std::shared_ptr<Tracer>& GetCurrentTracer() { return g_current_tracer; }

paddle/fluid/imperative/tracer.h

Lines changed: 1 addition & 1 deletion

@@ -118,9 +118,9 @@ class Tracer {
   bool enable_program_desc_tracing_{false};
   std::unique_ptr<UniqueNameGenerator> generator_;
   platform::Place expected_place_;
-  bool has_grad_{true};
   bool enable_autocast_{false};
   GarbageCollectorMap gcs_;
+  static thread_local bool has_grad_;
 };
 
 // To access static variable current_tracer
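
The pair of changes above (the definition in tracer.cc, the declaration in tracer.h) turns has_grad_ from a per-instance field into a static thread_local member, so each thread that traces ops sees its own flag regardless of which Tracer object it goes through. A minimal standalone sketch of that C++ pattern; the accessor names and the demo main are illustrative only, not Paddle code:

#include <iostream>
#include <thread>

class Tracer {
 public:
  static void SetHasGrad(bool v) { has_grad_ = v; }
  static bool HasGrad() { return has_grad_; }

 private:
  // Declared in the class, defined exactly once outside it; every thread
  // gets its own copy, initialized to true.
  static thread_local bool has_grad_;
};

thread_local bool Tracer::has_grad_ = true;

int main() {
  Tracer::SetHasGrad(false);  // only changes the main thread's copy
  std::thread worker([] {
    std::cout << "worker has_grad = " << Tracer::HasGrad() << std::endl;  // prints 1
  });
  worker.join();
  std::cout << "main has_grad = " << Tracer::HasGrad() << std::endl;  // prints 0
  return 0;
}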

paddle/fluid/inference/tensorrt/op_teller.cc

Lines changed: 3 additions & 2 deletions

@@ -703,8 +703,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         return false;
       }
       // Paddle-TRT does not support the input tensors: Shape and ShapeTensor
-      if (desc.Input("Shape").size() >= 1 ||
-          desc.Input("ShapeTensor").size() >= 1) {
+      auto reshape_inputs = desc.Inputs();
+      if (reshape_inputs.find("Shape") != reshape_inputs.end() ||
+          reshape_inputs.find("ShapeTensor") != reshape_inputs.end()) {
         return false;
       }
       std::vector<int> shape =
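
The reshape check above stops calling desc.Input(...) for slots the op may not have and instead asks whether the slot name is present in desc.Inputs(). A small illustration of the difference, assuming the input table behaves like a std::map from slot name to argument names; the helper names here are hypothetical, not the Paddle API:

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Stand-in for an op description's input table: slot name -> argument names.
using InputMap = std::map<std::string, std::vector<std::string>>;

// Key-existence check, as in the fix: a missing "Shape" slot is simply
// "not present" rather than a failed lookup.
bool HasShapeInput(const InputMap& inputs) {
  return inputs.find("Shape") != inputs.end() ||
         inputs.find("ShapeTensor") != inputs.end();
}

int main() {
  InputMap reshape_inputs{{"X", {"reshape_in"}}};  // built without a "Shape" slot
  // inputs.at("Shape") would throw std::out_of_range here; find() does not.
  std::cout << HasShapeInput(reshape_inputs) << std::endl;  // prints 0
  return 0;
}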

paddle/fluid/operators/activation_op_npu.cc

Lines changed: 44 additions & 0 deletions

@@ -527,6 +527,39 @@ class CosGradNPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class AtanNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto place = ctx.GetPlace();
+    out->mutable_data<T>(place);
+    const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {});
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AtanGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto place = ctx.GetPlace();
+    dx->mutable_data<T>(place);
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    const auto& runner_dx = NpuOpRunner("AtanGrad", {*x, *dout}, {*dx}, {});
+    runner_dx.Run(stream);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 

@@ -648,3 +681,14 @@ REGISTER_OP_NPU_KERNEL(
     cos_grad, ops::CosGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
     ops::CosGradNPUKernel<paddle::platform::NPUDeviceContext,
                           paddle::platform::float16>);
+
+REGISTER_OP_NPU_KERNEL(
+    atan, ops::AtanNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::AtanNPUKernel<paddle::platform::NPUDeviceContext,
+                       paddle::platform::float16>);
+
+REGISTER_OP_NPU_KERNEL(
+    atan_grad,
+    ops::AtanGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::AtanGradNPUKernel<paddle::platform::NPUDeviceContext,
+                           paddle::platform::float16>);
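
For reference, the backward kernel registered above forwards x and dout to the NPU "AtanGrad" operator; mathematically the expected gradient is dout / (1 + x^2). A tiny standalone check of that derivative against a central difference (plain C++, independent of Paddle and the NPU op):

#include <cmath>
#include <cstdio>

int main() {
  // d/dx atan(x) = 1 / (1 + x^2); compare against a central difference.
  const double x = 0.7;
  const double eps = 1e-6;
  const double analytic = 1.0 / (1.0 + x * x);
  const double numeric = (std::atan(x + eps) - std::atan(x - eps)) / (2.0 * eps);
  std::printf("analytic = %.9f, numeric = %.9f\n", analytic, numeric);
  return 0;
}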

paddle/fluid/operators/collective/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -59,6 +59,8 @@ if(WITH_ASCEND_CL)
             DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
     cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
             DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+    cc_test(checknumeric SRCS checknumeric_npu_test.cc
+            DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
     cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
             DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
     cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc

paddle/fluid/operators/collective/c_allreduce_op.h

Lines changed: 36 additions & 25 deletions

@@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
 };
 
 #if defined(PADDLE_WITH_ASCEND_CL)
-// return true if found_inf_or_nan or return false;
-template <typename T>
-bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
-                   aclrtStream stream, const paddle::framework::Tensor* in) {
-  auto& dev_ctx =
-      exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
+// return true if found_nan or return false;
+inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
+                        aclrtStream stream,
+                        const paddle::framework::Tensor* in) {
   using Tensor = paddle::framework::Tensor;
   Tensor out(in->type());
-  out.Resize(in->dims());
-  out.mutable_data<T>(dev_ctx.GetPlace());
 
-  bool found_inf_data = false;
+  Tensor mean(in->type());
+  mean.Resize({1});
+  mean.mutable_data<float>(dev_ctx.GetPlace());
+  std::vector<int> axes;
+  for (int i = 0; i < in->dims().size(); ++i) {
+    axes.push_back(i);
+  }
 
+  std::vector<float> vec;
   try {
-    const auto& runner =
-        NpuOpRunner("CheckNumerics", {*in}, {out},
-                    {{"message", std::string("check_numberics")}});
-    runner.Run(stream);
-    dev_ctx.Wait();
-  } catch (platform::EnforceNotMet& exception) {
-    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
-    found_inf_data = true;
+    const auto& runner_mean = paddle::operators::NpuOpRunner(
+        "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
+    TensorToVector(mean, dev_ctx, &vec);
   } catch (...) {
-    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
-    found_inf_data = true;
+    LOG(WARNING) << "ContainsNan catch exception";
+    return true;
+  }
+
+  VLOG(4) << "reducemeand result:" << vec[0];
+  if (std::isnan(static_cast<float>(vec[0]))) {
+    LOG(WARNING) << "ContainsNan detects nan";
+    return true;
+  }
+
+  if (std::isinf(static_cast<float>(vec[0]))) {
+    LOG(WARNING) << "ContainsNan detects inf";
   }
 
-  return found_inf_data;
+  return false;
 }
+
 #endif
 
 template <ReduceType red_type, typename T>

@@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
     framework::Tensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
 
-    bool check_numerics = false;
+    bool found_nan = false;
 
     auto d_type = in->type();
     switch (d_type) {
-      case framework::proto::VarType::FP16:
+      case framework::proto::VarType::FP16: {
+        break;
+      }
       case framework::proto::VarType::FP32: {
         VLOG(4) << "prepare to FoundNanInf";
-        check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
-        VLOG(4) << "check_numerics:" << check_numerics;
+        found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
+        VLOG(4) << "check_numerics:" << found_nan;
         break;
       }
       default:
         break;
     }
 
-    if (check_numerics) {
+    if (found_nan) {
       T inf = static_cast<T>(std::numeric_limits<float>::infinity());
       VLOG(4) << "fill input data constant inf";
       auto dims = in->dims();
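
Instead of relying on the CheckNumerics NPU op throwing, the rewritten helper builds a ReduceMeanD runner over all axes, copies the reduced scalar to the host, and inspects it: a NaN anywhere in the input propagates into the mean and makes the function return true, while an Inf mean is only logged. A minimal host-side sketch of the same idea (plain C++, not the NPU code path):

#include <cmath>
#include <iostream>
#include <limits>
#include <numeric>
#include <vector>

// Mirrors the ContainsNan logic on the host: a NaN element poisons the mean,
// so checking one reduced scalar is enough to flag the whole tensor.
bool ContainsNanByMean(const std::vector<float>& data) {
  const double mean = std::accumulate(data.begin(), data.end(), 0.0) /
                      static_cast<double>(data.size());
  if (std::isnan(mean)) return true;
  if (std::isinf(mean)) {
    std::cerr << "mean is inf, input may contain inf" << std::endl;  // warn only
  }
  return false;
}

int main() {
  std::vector<float> clean(3 * 128, 1.0f);
  std::vector<float> poisoned = clean;
  poisoned[0] = std::numeric_limits<float>::quiet_NaN();
  std::cout << ContainsNanByMean(clean) << " "           // prints 0
            << ContainsNanByMean(poisoned) << std::endl;  // prints 1
  return 0;
}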

paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc

Lines changed: 23 additions & 14 deletions

@@ -38,6 +38,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/hccl_helper.h"
 #endif
 
+// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
+// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
+// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
+// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
+
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 namespace m = paddle::operators::math;

@@ -52,10 +57,11 @@ DECLARE_string(selected_npus);
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   std::string debugstring = "";
+  std::cout << preStr << ":" << std::endl << debugstring;
   for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
+    std::cout << ele << " ";
   }
-  VLOG(3) << preStr << ":" << std::endl << debugstring;
+  std::cout << std::endl;
 }
 
 void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,

@@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
   ctx.Wait();
 }
 
+template <typename T>
 void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
                          int iter) {
   // init

@@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
   int num1 = 3;
   int num2 = 128;
 
-  std::vector<float> init;
+  std::vector<T> init;
   for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id);
+    init.push_back(static_cast<T>(1.0 + rank_id));
   }
+  init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
   PrintDebugInfo("input data", init);
 
   auto place = ctx.GetPlace();

@@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
   auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
+  tensor_out->mutable_data<T>(place);  // allocate
   ctx.Wait();
 
   // run
   f::AttributeMap attrs;
   attrs["tag"] = std::string("tagx_" + std::to_string(iter));
   attrs["ring_id"] = 0;
+  attrs["use_calc_stream"] = 1;
 
   auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
                                     {{"Out", {"OutData"}}}, attrs);
-
-  for (int i = 0; i < 10; i++) {
+  for (int i = 0; i < 1; i++) {
     op->Run(*scope, place);
   }
   ctx.Wait();
 
-  std::vector<float> out_vec;
+  std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
   ctx.Wait();
 
   PrintDebugInfo("output data", out_vec);
 
+  float diff = static_cast<float>(out_vec[0]) - 65504;
+  EXPECT_TRUE(diff < 0.1 && diff > -0.1);
   EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 3.0);
+  for (uint32_t i = 1; i < 10; i++) {
+    EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
   }
 }
 

@@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) {
   // only support one device, if more than one device, use first default
   PrepareUniqueId(&scope, ctx, &hccl_id);
   Prepare(&scope, ctx, &hccl_id);
-  for (int i = 0; i < 1; i++) {
-    VLOG(2) << "iter num: " << i;
-    TestHCCLAllReduceOp(&scope, ctx, i);
-  }
+
+  TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
+  // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
 }
