Skip to content

Commit 2b4d669

Browse files
committed
[NPU] support npu profiler (PaddlePaddle#31684)
* support npu profiler * add python api * fix bugs * add wrapper for incomplete type * update profile proto * record npu wait * add xpu placeholder
1 parent 4668f1e commit 2b4d669

9 files changed

Lines changed: 204 additions & 10 deletions

File tree

cmake/external/ascend.cmake

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,13 @@ if(WITH_ASCEND_CL)
6464

6565
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
6666
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
67-
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
67+
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
68+
set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
6869

69-
message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
70+
message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
7071
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
71-
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
72+
INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
73+
INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
7274

7375
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
7476
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})

paddle/fluid/operators/expand_op_npu.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,11 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
5858
expand_times.size(), static_cast<size_t>(in_dims.size())));
5959
auto* out0 = context.Output<framework::LoDTensor>("Out");
6060
framework::DDim out_dims(in_dims);
61+
6162
for (size_t i = 0; i < expand_times.size(); ++i) {
6263
out_dims[i] *= expand_times[i];
6364
}
65+
6466
out0->Resize(out_dims);
6567
out0->mutable_data<T>(context.device_context().GetPlace());
6668
auto runner =

paddle/fluid/platform/device_context.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ limitations under the License. */
1616
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
1717
#include "paddle/fluid/platform/cuda_device_guard.h"
1818
#endif
19-
2019
#include "glog/logging.h"
20+
#include "paddle/fluid/platform/profiler.h"
2121

2222
namespace paddle {
2323
namespace memory {
@@ -254,6 +254,7 @@ NPUDeviceContext::~NPUDeviceContext() {
254254
}
255255

256256
void NPUDeviceContext::Wait() const {
257+
platform::RecordEvent record_event("NPUDeviceContext/wait");
257258
NPUDeviceGuard guard(place_.device);
258259
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
259260
}

paddle/fluid/platform/device_tracer.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,8 @@ class DeviceTracerImpl : public DeviceTracer {
587587
BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId());
588588
} else if (platform::is_cuda_pinned_place(r.place)) {
589589
event->set_place(proto::MemEvent::CUDAPinnedPlace);
590+
} else if (platform::is_npu_place(r.place)) {
591+
event->set_place(proto::MemEvent::NPUPlace);
590592
} else {
591593
PADDLE_THROW(platform::errors::Unimplemented(
592594
"The current place is not supported."));
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <cstdint>
#include <string>
#include <vector>

#include "acl/acl_prof.h"
#include "paddle/fluid/platform/enforce.h"
// Required for GetCurrentNPUDeviceId(); do not rely on transitive includes.
#include "paddle/fluid/platform/npu_info.h"

namespace paddle {
namespace platform {

// Thin RAII-free wrappers around the Ascend ACL profiling API. Callers are
// responsible for pairing Init/Finalize and Start/Stop.

// For ACL 20.1
// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats
// ACL_AICORE_PIPELINE = 1, record pipeline
// ACL_AICORE_SYNCHRONIZATION = 2, record sync
// ACL_AICORE_MEMORY = 3, record memory
// ACL_AICORE_INTERNAL_MEMORY = 4, record internal memory
// ACL_AICORE_STALL = 5, record pipeline ratio
constexpr aclprofAicoreMetrics default_metrics =
    ACL_AICORE_ARITHMATIC_THROUGHPUT;

// ACL_PROF_ACL_API, record ACL API stats
// ACL_PROF_TASK_TIME, record AI core stats
// ACL_PROF_AICORE_METRICS, must include
// ACL_PROF_AICPU_TRACE, record AICPU, not supported yet
constexpr uint64_t default_type =
    ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME;

// Creates an aclprofConfig for the given devices (defaults to the current
// NPU device when `devices` is empty). The returned config must be released
// with NPUProfilerDestroyConfig.
// NOTE: `inline` is required — these are definitions in a header, and without
// it every translation unit that includes this file would emit its own
// external definition, causing multiple-definition link errors.
inline aclprofConfig *NPUProfilerCreateConfig(
    std::vector<uint32_t> devices = {},
    aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type,
    aclprofAicoreEvents *events = nullptr) {
  if (devices.empty()) {
    // Avoid implicit int -> uint32_t narrowing.
    devices.emplace_back(static_cast<uint32_t>(GetCurrentNPUDeviceId()));
  }
  aclprofConfig *config =
      aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c);
  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External(
                                      "Failed to create prof config for NPU"));
  return config;
}

// Releases a config previously created by NPUProfilerCreateConfig.
inline void NPUProfilerDestroyConfig(const aclprofConfig *config) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config));
}

// Initializes the ACL profiler; results are written under `output_path`.
inline void NPUProfilerInit(std::string output_path) {
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclprofInit(output_path.c_str(), output_path.size()));
}

// Starts profiling. When `config` is nullptr a config for the current device
// is created internally.
// NOTE(review): a config created here is only destroyed if the caller later
// passes the same pointer to NPUProfilerStop — callers passing nullptr cannot
// do that, so they should prefer creating the config explicitly.
inline void NPUProfilerStart(const aclprofConfig *config) {
  if (config == nullptr) {
    // NOTE(zhiqiu): support single device by default.
    int device_id = GetCurrentNPUDeviceId();
    std::vector<uint32_t> devices = {static_cast<uint32_t>(device_id)};
    config = NPUProfilerCreateConfig(devices);
  }
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config));
}

// Stops profiling and destroys the config (Stop takes ownership of it).
inline void NPUProfilerStop(const aclprofConfig *config) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config));
  NPUProfilerDestroyConfig(config);
}

// Finalizes the ACL profiler; pair with NPUProfilerInit.
inline void NPUProfilerFinalize() {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize());
}

// pybind11 cannot hold the incomplete type aclprofConfig by value, so expose
// this thin wrapper around the raw (non-owning) pointer instead.
struct NPUProfConfigWrapper {
  aclprofConfig *p_;
  explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {}
  aclprofConfig *ptr() { return p_; }
};

}  // namespace platform
}  // namespace paddle

paddle/fluid/platform/profiler.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ message Event {
2121
enum EventType {
2222
CPU = 0;
2323
GPUKernel = 1;
24+
NPUKernel = 2;
2425
}
2526
optional EventType type = 8;
2627
optional string name = 1;
@@ -39,6 +40,8 @@ message MemEvent {
3940
CUDAPlace = 0;
4041
CPUPlace = 1;
4142
CUDAPinnedPlace = 2;
43+
XPUPlace = 3;
44+
NPUPlace = 4;
4245
}
4346
optional uint64 start_ns = 1;
4447
optional uint64 end_ns = 2;

paddle/fluid/pybind/pybind.cc

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ limitations under the License. */
109109

110110
#ifdef PADDLE_WITH_ASCEND_CL
111111
#include "paddle/fluid/platform/npu_info.h"
112+
#include "paddle/fluid/platform/npu_profiler.h"
112113
#endif
113114

114115
#ifdef PADDLE_WITH_XPU
@@ -581,11 +582,6 @@ PYBIND11_MODULE(core_noavx, m) {
581582
make_ddim(x_dim), make_ddim(y_dim), -1));
582583
});
583584

584-
#ifdef PADDLE_WITH_ASCEND_CL
585-
m.def("_npu_finalize",
586-
[]() { platform::AclInstance::Instance().Finalize(); });
587-
#endif
588-
589585
m.def(
590586
"_append_python_callable_object_and_return_id",
591587
[](py::object py_obj) -> size_t {
@@ -2180,6 +2176,31 @@ All parameter, weight, gradient are variables in Paddle.
21802176
#endif
21812177
#endif
21822178

2179+
#ifdef PADDLE_WITH_ASCEND_CL
2180+
m.def("get_npu_device_count", platform::GetNPUDeviceCount);
2181+
m.def("_npu_finalize", []() {
2182+
platform::AclInstance::Instance().Finalize();
2183+
}); // private interface
2184+
2185+
py::class_<platform::NPUProfConfigWrapper>(m, "NPUProfConfigWrapper");
2186+
2187+
m.def("npu_prof_init", platform::NPUProfilerInit);
2188+
m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) {
2189+
platform::NPUProfilerStart(c.ptr());
2190+
});
2191+
m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) {
2192+
platform::NPUProfilerStop(c.ptr());
2193+
});
2194+
m.def("npu_prof_finalize", platform::NPUProfilerFinalize);
2195+
m.def("npu_prof_create_config", []() {
2196+
return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig());
2197+
});
2198+
2199+
m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) {
2200+
platform::NPUProfilerDestroyConfig(c.ptr());
2201+
});
2202+
#endif
2203+
21832204
py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
21842205
.value("kDefault", platform::TracerOption::kDefault)
21852206
.value("kOpDetail", platform::TracerOption::kOpDetail)

python/paddle/fluid/profiler.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,65 @@ def cuda_profiler(output_file, output_mode=None, config=None):
106106
os.remove(config_file)
107107

108108

109+
@signature_safe_contextmanager
def npu_profiler(output_file, config=None):
    """
    The NPU profiler.

    This function is used to profile NPU program by NPU runtime application
    programming interface. The profiling result will be written into
    `output_file`. The users can set the NPU profiling config by `config`
    argument.

    After getting the profiling result file, users can use
    `tools provided by Ascend <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_
    to load this output file to visualize results.

    Args:
        output_file (str) : The output file name, the result will be
            written into this file. It should be absolute path.
        config (list<str>, optional) : NPU profile config. For more details, please
            refer to `User Guide <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ .

    Examples:

        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.profiler as profiler
            import numpy as np

            epoc = 8
            dshape = [4, 3, 28, 28]
            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

            place = fluid.NPUPlace(0)
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            output_file = 'npu.txt'
            # the context manager yields nothing, so no `as` target is needed
            with profiler.npu_profiler(output_file):
                for i in range(epoc):
                    input = np.random.random(dshape).astype('float32')
                    exe.run(fluid.default_main_program(), feed={'data': input})
            # then use NPU profiler tools to load this output file
            # to visualize results.
    """
    # TODO: support config in python.
    if not config:
        config = core.npu_prof_create_config()

    core.npu_prof_init(output_file)
    # Enables profiler collection by the active NPU profiling tool.
    core.npu_prof_start(config)
    try:
        yield
    # Disables profiler collection even if the profiled body raises.
    finally:
        core.npu_prof_stop(config)
        core.npu_prof_finalize()
166+
167+
109168
def reset_profiler():
110169
"""
111170
Clear the previous time record. This interface does not work for

tools/timeline.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,13 @@ def _allocate_pids(self):
186186
self._chrome_trace.emit_pid(
187187
"memory usage on %s:cudapinnedplace:%d" %
188188
(k, mevent.device_id), pid)
189+
elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
190+
if (k, mevent.device_id, "NPU") not in self._mem_devices:
191+
pid = self._allocate_pid()
192+
self._mem_devices[(k, mevent.device_id, "NPU")] = pid
193+
self._chrome_trace.emit_pid(
194+
"memory usage on %s:npu:%d" % (k, mevent.device_id),
195+
pid)
189196
if (k, 0, "CPU") not in self._mem_devices:
190197
pid = self._allocate_pid()
191198
self._mem_devices[(k, 0, "CPU")] = pid
@@ -201,6 +208,11 @@ def _allocate_pids(self):
201208
self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
202209
self._chrome_trace.emit_pid(
203210
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
211+
if (k, 0, "NPU") not in self._mem_devices:
212+
pid = self._allocate_pid()
213+
self._mem_devices[(k, 0, "NPU")] = pid
214+
self._chrome_trace.emit_pid("memory usage on %s:npu:%d" %
215+
(k, 0), pid)
204216

205217
def _allocate_events(self):
206218
for k, profile_pb in six.iteritems(self._profile_dict):
@@ -227,7 +239,8 @@ def _allocate_memory_event(self):
227239
place_to_str = {
228240
profiler_pb2.MemEvent.CPUPlace: "CPU",
229241
profiler_pb2.MemEvent.CUDAPlace: "GPU",
230-
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
242+
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
243+
profiler_pb2.MemEvent.NPUPlace: "NPU"
231244
}
232245
for k, profile_pb in six.iteritems(self._profile_dict):
233246
mem_list = []

0 commit comments

Comments
 (0)