Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions paddle/fluid/operators/collective/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ if(WITH_ASCEND_CL)
DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(checknumeric SRCS checknumeric_npu_test.cc
DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
Expand Down
61 changes: 36 additions & 25 deletions paddle/fluid/operators/collective/c_allreduce_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
};

#if defined(PADDLE_WITH_ASCEND_CL)
// return true if found_inf_or_nan or return false;
template <typename T>
bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
aclrtStream stream, const paddle::framework::Tensor* in) {
auto& dev_ctx =
exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
// return true if found_nan or return false;
inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
aclrtStream stream,
const paddle::framework::Tensor* in) {
using Tensor = paddle::framework::Tensor;
Tensor out(in->type());
out.Resize(in->dims());
out.mutable_data<T>(dev_ctx.GetPlace());

bool found_inf_data = false;
Tensor mean(in->type());
mean.Resize({1});
mean.mutable_data<float>(dev_ctx.GetPlace());
std::vector<int> axes;
for (int i = 0; i < in->dims().size(); ++i) {
axes.push_back(i);
}

std::vector<float> vec;
try {
const auto& runner =
NpuOpRunner("CheckNumerics", {*in}, {out},
{{"message", std::string("check_numberics")}});
runner.Run(stream);
dev_ctx.Wait();
} catch (platform::EnforceNotMet& exception) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
LOG(WARNING) << "ContainsNan catch exception";
return true;
}

VLOG(4) << "reducemeand result:" << vec[0];
if (std::isnan(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects nan";
return true;
}

if (std::isinf(static_cast<float>(vec[0]))) {
LOG(WARNING) << "ContainsNan detects inf";
}

return found_inf_data;
return false;
}

#endif

template <ReduceType red_type, typename T>
Expand Down Expand Up @@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
framework::Tensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace());

bool check_numerics = false;
bool found_nan = false;

auto d_type = in->type();
switch (d_type) {
case framework::proto::VarType::FP16:
case framework::proto::VarType::FP16: {
break;
}
case framework::proto::VarType::FP32: {
VLOG(4) << "prepare to FoundNanInf";
check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << check_numerics;
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << found_nan;
break;
}
default:
break;
}

if (check_numerics) {
if (found_nan) {
T inf = static_cast<T>(std::numeric_limits<float>::infinity());
VLOG(4) << "fill input data constant inf";
auto dims = in->dims();
Expand Down
37 changes: 23 additions & 14 deletions paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h"
#endif

// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
Expand All @@ -52,10 +57,11 @@ DECLARE_string(selected_npus);
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
std::cout << preStr << ":" << std::endl << debugstring;
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
std::cout << ele << " ";
}
VLOG(3) << preStr << ":" << std::endl << debugstring;
std::cout << std::endl;
}

void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
Expand Down Expand Up @@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
ctx.Wait();
}

template <typename T>
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int iter) {
// init
Expand All @@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int num1 = 3;
int num2 = 128;

std::vector<float> init;
std::vector<T> init;
for (int64_t i = 0; i < num1 * num2; ++i) {
init.push_back(1.0 + rank_id);
init.push_back(static_cast<T>(1.0 + rank_id));
}
init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
PrintDebugInfo("input data", init);

auto place = ctx.GetPlace();
Expand All @@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
tensor_out->mutable_data<T>(place); // allocate
ctx.Wait();

// run
f::AttributeMap attrs;
attrs["tag"] = std::string("tagx_" + std::to_string(iter));
attrs["ring_id"] = 0;
attrs["use_calc_stream"] = 1;

auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);

for (int i = 0; i < 10; i++) {
for (int i = 0; i < 1; i++) {
op->Run(*scope, place);
}
ctx.Wait();

std::vector<float> out_vec;
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();

PrintDebugInfo("output data", out_vec);

float diff = static_cast<float>(out_vec[0]) - 65504;
EXPECT_TRUE(diff < 0.1 && diff > -0.1);
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], 3.0);
for (uint32_t i = 1; i < 10; i++) {
EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
}
}

Expand All @@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) {
// only support one device, if more than one device, use first default
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
for (int i = 0; i < 1; i++) {
VLOG(2) << "iter num: " << i;
TestHCCLAllReduceOp(&scope, ctx, i);
}

TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
// TestHCCLAllReduceOp<float>(&scope, ctx, 0);
}
99 changes: 99 additions & 0 deletions paddle/fluid/operators/collective/checknumeric_npu_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef _WIN32
#include <unistd.h>
#endif

#include <stdio.h>
#include <cmath>
#include <string>
#include <thread> // NOLINT
#include <vector>

#include "gtest/gtest.h"

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"

#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif

namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(c_allreduce_sum);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
DECLARE_string(selected_npus);

// Fills an NPU tensor of `size` elements, all equal to `value`, and returns
// whether ContainsNan reports it as containing NaN.
//
// @tparam T     element type of the tensor (float or platform::float16).
// @param value  value replicated into every element of the input tensor.
// @param size   number of elements; defaults to 2 * 512 * 8192.
// @return       result of paddle::operators::ContainsNan on the tensor.
template <typename T>
bool Check(T value, int size = 2 * 512 * 8192) {
  f::Scope scope;
  auto x = scope.Var("in");
  // NOTE(review): assumes device 0's context really is an NPUDeviceContext;
  // a failed dynamic_cast would yield a null deref here — confirm if this
  // test can ever run with a different context type.
  auto& ctx = *dynamic_cast<p::NPUDeviceContext*>(
      p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
  auto place = ctx.GetPlace();

  auto tensor_x = x->GetMutable<f::LoDTensor>();
  tensor_x->Resize({size});
  tensor_x->mutable_data<T>(place);  // allocate device memory

  // Build the host buffer in one shot with the fill constructor instead of
  // growing it element-by-element with push_back (avoids repeated
  // reallocation for the ~8M-element default size).
  std::vector<T> init(size, static_cast<T>(value));

  TensorFromVector(init, ctx, tensor_x);
  return paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
}

// Verifies ContainsNan on the NPU: normal and infinite inputs must NOT be
// reported as NaN; NaN inputs must be.
TEST(check_numeric, NPU) {
  auto inf = std::numeric_limits<float>::infinity();
  auto fp16_inf = static_cast<p::float16>(inf);
  auto nan = NAN;
  auto fp16_nan = static_cast<p::float16>(nan);

  bool result = false;
  // Normal values: not NaN. (65546 overflows float16 to inf, which
  // ContainsNan still treats as "no NaN found".)
  VLOG(0) << "start normal";
  result = Check<p::float16>(static_cast<p::float16>(65546));
  ASSERT_FALSE(result);
  // Bug fix: the return value was previously discarded, so this assertion
  // silently re-checked the preceding fp16 result instead of the float case.
  result = Check<float>(static_cast<float>(1.0));
  ASSERT_FALSE(result);

  // Inf: ContainsNan only flags NaN (inf is logged but returns false).
  VLOG(0) << "start inf";
  result = Check<p::float16>(fp16_inf);
  ASSERT_FALSE(result);
  result = Check<float>(inf);
  ASSERT_FALSE(result);

  // NaN: must be detected.
  VLOG(0) << "start nan";
  result = Check<p::float16>(fp16_nan);
  ASSERT_TRUE(result);
  result = Check<float>(nan);
  ASSERT_TRUE(result);
}