Skip to content

Commit 466aefe

Browse files
committed
leave hccl_check_nan, set default to true
1 parent bf74c14 commit 466aefe

File tree

3 files changed

+13
-4
lines changed

3 files changed

+13
-4
lines changed

paddle/fluid/operators/collective/c_allreduce_op.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ limitations under the License. */
4545
#include "paddle/fluid/platform/hccl_helper.h"
4646
#endif
4747

48+
// hccl_check_nan is defined in paddle/fluid/platform/flags.cc; it is declared
// here only for Ascend builds so this header can read FLAGS_hccl_check_nan.
#if defined(PADDLE_WITH_ASCEND_CL)
49+
DECLARE_bool(hccl_check_nan);
50+
#endif
51+
4852
namespace paddle {
4953
namespace operators {
5054

@@ -233,10 +237,11 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
233237
break;
234238
}
235239
case framework::proto::VarType::FP32: {
236-
VLOG(4) << "prepare to FoundNanInf";
237-
// NOTE: performance relating, DO NOT REMOVE!
238-
found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in);
239-
VLOG(4) << "check_numerics:" << found_nan;
240+
if (FLAGS_hccl_check_nan) {
241+
VLOG(3) << "prepare to FoundNanInf";
242+
// NOTE: performance-related, DO NOT REMOVE!
243+
ContainsNan(*dev_ctx, dev_ctx->stream(), in);
244+
}
240245
break;
241246
}
242247
default:

paddle/fluid/platform/flags.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ DEFINE_string(selected_npus, "",
9393
"This option is useful when doing multi process training and "
9494
"each process have only one device (NPU). If you want to use "
9595
"all visible devices, set this to empty string.");
96+
// Gates the ContainsNan call on FP32 input in the Ascend c_allreduce kernel
// (see the FLAGS_hccl_check_nan check in c_allreduce_op.h). Defaults to true.
DEFINE_bool(hccl_check_nan, true,
97+
"Check Nan in tensor before hccl_allreduce_sum otherwise it'll "
98+
"core when meets Nan value");
9699
DEFINE_string(
97100
npu_config_path, "",
98101
"The absolute path of configuration json file, like: /tmp/config.json. "

python/paddle/fluid/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ def __bootstrap__():
248248
'gpu_memory_limit_mb',
249249
'npu_config_path',
250250
'get_host_by_name_time',
251+
'hccl_check_nan',
251252
'min_loss_scaling',
252253
]
253254

0 commit comments

Comments
 (0)