Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions paddle/fluid/operators/collective/c_allreduce_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h"
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
DECLARE_bool(hccl_check_nan);
#endif

namespace paddle {
namespace operators {

Expand Down Expand Up @@ -140,6 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
try {
const auto& runner_mean = paddle::operators::NpuOpRunner(
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
runner_mean.Run(stream);
TensorToVector(mean, dev_ctx, &vec);
} catch (...) {
LOG(WARNING) << "ContainsNan catch exception";
Expand Down Expand Up @@ -233,9 +238,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
break;
}
case framework::proto::VarType::FP32: {
VLOG(4) << "prepare to FoundNanInf";
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(4) << "check_numerics:" << found_nan;
if (FLAGS_hccl_check_nan) {
VLOG(3) << "prepare to FoundNanInf";
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
VLOG(3) << "check_numerics:" << found_nan;
}
break;
}
default:
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/platform/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "",
"This option is useful when doing multi process training and "
"each process have only one device (NPU). If you want to use "
"all visible devices, set this to empty string.");
DEFINE_bool(hccl_check_nan, false,
"Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
"core when meets Nan value");
DEFINE_string(
npu_config_path, "",
"The absolute path of configuration json file, like: /tmp/config.json. "
Expand Down
1 change: 1 addition & 0 deletions python/paddle/fluid/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ def __bootstrap__():
'gpu_memory_limit_mb',
'npu_config_path',
'get_host_by_name_time',
'hccl_check_nan',
]

core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
Expand Down