Skip to content

Commit 853af66

Browse files
authored
[NPU] support cann 20.3 (#32044)
* fix compile problem on cann 20.3 * fix ut * fix test_mul * fix check_finite_and_scale * fix lookup_table_v2_grad * fix cmake * support print op
1 parent 78959a3 commit 853af66

13 files changed

Lines changed: 51 additions & 166 deletions

cmake/external/ascend.cmake

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ else()
2121
set(ASCEND_DIR /usr/local/Ascend)
2222
endif()
2323

24+
if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
25+
# The presence of graph/ascend_string.h means CANN 20.2+
26+
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
27+
endif()
28+
2429
if(WITH_ASCEND)
2530
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
2631
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
@@ -43,9 +48,7 @@ if(WITH_ASCEND)
4348
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
4449
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
4550

46-
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
47-
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
48-
endif()
51+
4952

5053
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
5154
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})

paddle/fluid/operators/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,6 @@ endif()
159159

160160
if (WITH_ASCEND_CL)
161161
cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
162-
cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op)
163162
cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op scope device_context enforce executor compare_op)
164163
endif()
165164

paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
6161

6262
size_t x_size = xs.size();
6363
for (size_t i = 0; i < x_size; ++i) {
64-
found_inf_data = true;
6564
const auto* x = xs[i];
6665
auto* out = outs[i];
6766
out->mutable_data<T>(ctx.GetPlace());
@@ -77,6 +76,8 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
7776
NpuOpRunner("CheckNumerics", {*x}, {check_xout},
7877
{{"message", std::string("check_nan_and_inf")}});
7978
runner_checknumerics.Run(stream);
79+
ctx.template device_context<paddle::platform::NPUDeviceContext>()
80+
.Wait();
8081
} catch (platform::EnforceNotMet& exception) {
8182
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
8283
found_inf_data = true;

paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,10 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) {
110110
// out found_inf
111111
Tensor found_inf_tensor;
112112
found_inf_tensor.Resize({1});
113-
bool *is_finite_data =
113+
bool *found_inf_data =
114114
found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
115115
f::TensorCopy(*found_inf, place, &found_inf_tensor);
116-
EXPECT_FALSE(*is_finite_data);
116+
EXPECT_TRUE(*found_inf_data);
117117

118118
ctx.Wait();
119119
}

paddle/fluid/operators/lookup_table_v2_op_npu.cc

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
2828
auto *ids_t = ctx.Input<framework::LoDTensor>("Ids"); // int tensor
2929
auto *output_t = ctx.Output<framework::LoDTensor>("Out"); // float tensor
3030
auto *table_t = ctx.Input<framework::LoDTensor>("W");
31+
32+
// It seems cann 20.1 accepts int64, but cann 20.2+ does not.
33+
PADDLE_ENFORCE_EQ(ids_t->type(), framework::proto::VarType::INT32,
34+
platform::errors::Unimplemented(
35+
"The index of LookupTableV2 should be int32."));
36+
3137
auto *table_var = ctx.InputVar("W");
3238
PADDLE_ENFORCE_EQ(
3339
table_var->IsType<framework::LoDTensor>(), true,
@@ -49,28 +55,26 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
4955
public:
5056
void Compute(const framework::ExecutionContext &ctx) const override {
5157
auto *ids_t = ctx.Input<framework::LoDTensor>("Ids");
58+
5259
auto *output_grad_t =
5360
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
5461
auto *table_grad_t =
5562
ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
56-
table_grad_t->mutable_data<T>(ctx.GetPlace());
63+
auto *p = table_grad_t->mutable_data<T>(ctx.GetPlace());
5764

5865
auto stream =
5966
ctx.template device_context<paddle::platform::NPUDeviceContext>()
6067
.stream();
6168

62-
// step2: ZerosLike x in device
63-
Tensor zeroslike_w(table_grad_t->type());
64-
zeroslike_w.Resize(table_grad_t->dims());
65-
auto p = zeroslike_w.mutable_data<T>(ctx.GetPlace());
66-
6769
platform::NPUMemsetAsync(static_cast<void *>(p), 0,
68-
zeroslike_w.numel() * sizeof(T), stream);
70+
table_grad_t->numel() * sizeof(T), stream);
6971

70-
table_grad_t->mutable_data<T>(ctx.GetPlace());
72+
// NOTE(zhiqiu): It seems in cann 20.1, the first input and output
73+
// can be different tensors, but in cann 20.2+, it does an in-place operation.
74+
// Thus, the first input and output should be the same tensor.
7175
auto runner_scatter =
72-
NpuOpRunner("ScatterAdd", {zeroslike_w, *ids_t, *output_grad_t},
73-
{*table_grad_t}, {});
76+
NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
77+
{*table_grad_t}, {{"use_locking", true}});
7478
runner_scatter.Run(stream);
7579
}
7680
};

paddle/fluid/operators/lookup_table_v2_op_npu_test.cc

Lines changed: 0 additions & 142 deletions
This file was deleted.

paddle/fluid/operators/tensor_formatter.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor,
125125
framework::LoDTensor cpu_tensor;
126126
platform::CPUPlace cpu_place;
127127
TensorCopy(print_tensor, cpu_place, &cpu_tensor);
128+
#ifdef PADDLE_WITH_ASCEND_CL
129+
if (platform::is_npu_place(print_tensor.place())) {
130+
platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait();
131+
}
132+
#endif
128133
data = cpu_tensor.data<T>();
129134
}
130135

paddle/fluid/platform/npu_profiler.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,17 @@ limitations under the License. */
2323
namespace paddle {
2424
namespace platform {
2525

26-
// For ACL 20.1
26+
#ifdef PADDLE_WITH_ASCEND_STRING
27+
// For CANN 20.2+
28+
// ACL_AICORE_ARITHMETIC_UTILIZATION = 0, record arithmetic stats
29+
// ACL_AICORE_PIPE_UTILIZATION = 1, record pipeline
30+
// ACL_AICORE_MEMORY_BANDWIDTH = 2, record memory
31+
// ACL_AICORE_L0B_AND_WIDTH = 3, record internal memory
32+
// ACL_AICORE_RESOURCE_CONFLICT_RATIO = 5, record pipeline ratio
33+
constexpr aclprofAicoreMetrics default_metrics =
34+
ACL_AICORE_ARITHMETIC_UTILIZATION;
35+
#else
36+
// For CANN 20.1
2737
// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats
2838
// ACL_AICORE_PIPELINE = 1, record pipeline
2939
// ACL_AICORE_SYNCHRONIZATION = 2, record sync
@@ -32,6 +42,7 @@ namespace platform {
3242
// ACL_AICORE_STALL = 5, record pipeline ratio
3343
constexpr aclprofAicoreMetrics default_metrics =
3444
ACL_AICORE_ARITHMATIC_THROUGHPUT;
45+
#endif
3546

3647
// ACL_PROF_ACL_API, record ACL API stats
3748
// ACL_PROF_TASK_TIME, record AI core stats

python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
import unittest
1616
import numpy as np
17+
import sys
18+
sys.path.append("..")
1719
from op_test import OpTest, skip_check_grad_ci
1820
import paddle
1921
import paddle.fluid as fluid

python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def setUp(self):
4141
vocab = 10
4242
dim = 20
4343
w = np.ones([vocab, dim]).astype(self.dtype)
44-
x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int64)
44+
x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32)
4545
out = np.ones([bsz, seqlen, dim]).astype(self.dtype)
4646

4747
self.inputs = {

0 commit comments

Comments
 (0)