Skip to content

Commit 5b73783

Browse files
authored
Add a flag to control whether to check for NaN values before hccl_allreduce_sum. (PaddlePaddle#35093)
1 parent a8dee3b commit 5b73783

File tree

3 files changed

+14
-3
lines changed

3 files changed

+14
-3
lines changed

paddle/fluid/operators/collective/c_allreduce_op.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ limitations under the License. */
4545
#include "paddle/fluid/platform/hccl_helper.h"
4646
#endif
4747

48+
#if defined(PADDLE_WITH_ASCEND_CL)
49+
DECLARE_bool(hccl_check_nan);
50+
#endif
51+
4852
namespace paddle {
4953
namespace operators {
5054

@@ -140,6 +144,7 @@ inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
140144
try {
141145
const auto& runner_mean = paddle::operators::NpuOpRunner(
142146
"ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
147+
runner_mean.Run(stream);
143148
TensorToVector(mean, dev_ctx, &vec);
144149
} catch (...) {
145150
LOG(WARNING) << "ContainsNan catch exception";
@@ -233,9 +238,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
233238
break;
234239
}
235240
case framework::proto::VarType::FP32: {
236-
VLOG(4) << "prepare to FoundNanInf";
237-
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
238-
VLOG(4) << "check_numerics:" << found_nan;
241+
if (FLAGS_hccl_check_nan) {
242+
VLOG(3) << "prepare to FoundNanInf";
243+
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
244+
VLOG(3) << "check_numerics:" << found_nan;
245+
}
239246
break;
240247
}
241248
default:

paddle/fluid/platform/flags.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "",
9393
"This option is useful when doing multi process training and "
9494
"each process have only one device (NPU). If you want to use "
9595
"all visible devices, set this to empty string.");
96+
DEFINE_bool(hccl_check_nan, false,
97+
"Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
98+
"core when meets Nan value");
9699
DEFINE_string(
97100
npu_config_path, "",
98101
"The absolute path of configuration json file, like: /tmp/config.json. "

python/paddle/fluid/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ def __bootstrap__():
248248
'gpu_memory_limit_mb',
249249
'npu_config_path',
250250
'get_host_by_name_time',
251+
'hccl_check_nan',
251252
]
252253

253254
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])

0 commit comments

Comments
 (0)