-
Notifications
You must be signed in to change notification settings - Fork 6k
Profiler refine and add CUDA runtime api tracer #15301
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
225d331
fc6dc7d
41c82ca
e9a625f
98a7543
a84497c
c431b6a
e069fab
31be7bf
d17f79e
bb49a5b
c2cb082
18f9158
eeb2aa7
5510f31
b59cd77
fbd3122
29c461f
dda480c
fa15b9f
6188131
02158db
a1ca223
58dea48
e46c6b9
b00bee2
058bcd8
f0c9f10
f878edf
3f29bb8
12ecf7d
03a7de2
4f39900
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ limitations under the License. */ | |
| #include "paddle/fluid/string/split.h" | ||
| #ifdef PADDLE_WITH_CUDA | ||
| #include "paddle/fluid/platform/cuda_device_guard.h" | ||
| #include "paddle/fluid/platform/dynload/cupti.h" | ||
| #endif | ||
| #include "paddle/fluid/platform/device_context.h" | ||
| #include "paddle/fluid/platform/init.h" | ||
|
|
@@ -30,6 +31,9 @@ limitations under the License. */ | |
|
|
||
| DEFINE_int32(paddle_num_threads, 1, | ||
| "Number of threads for each paddle instance."); | ||
| DEFINE_int32(multiple_of_cupti_buffer_size, 1, | ||
| "Multiple of the CUPTI device buffer size. If the timestamps have " | ||
| "been dropped when you are profiling, try increasing this value."); | ||
|
|
||
| namespace paddle { | ||
| namespace framework { | ||
|
|
@@ -78,7 +82,30 @@ void InitP2P(std::vector<int> devices) { | |
| #endif | ||
| } | ||
|
|
||
// Scales three CUPTI activity attributes (device buffer size, CDP device
// buffer size and profiling semaphore pool size) by
// FLAGS_multiple_of_cupti_buffer_size.
//
// Compiles to a no-op unless PADDLE_WITH_CUPTI is defined, and returns without
// touching CUPTI when the flag keeps its default value of 1.
void InitCupti() {
#ifdef PADDLE_WITH_CUPTI
  // The multiplier is a signed 32-bit flag that gets multiplied into a size_t
  // below; zero or a negative value would produce an empty or wrapped-around
  // attribute value, so reject it up front with a readable message.
  PADDLE_ENFORCE(FLAGS_multiple_of_cupti_buffer_size >= 1,
                 "multiple_of_cupti_buffer_size must be a positive integer.");
  if (FLAGS_multiple_of_cupti_buffer_size == 1) return;

  size_t attrValue = 0;
  size_t attrValueSize = sizeof(size_t);
// Reads the current value of `attr`, multiplies it by the flag and writes the
// result back. The results of both dynload calls are negated before being
// handed to PADDLE_ENFORCE, so any non-zero return value fails the check.
#define MULTIPLY_ATTR_VALUE(attr)                                        \
  {                                                                      \
    PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute(        \
        attr, &attrValueSize, &attrValue));                              \
    attrValue *= static_cast<size_t>(FLAGS_multiple_of_cupti_buffer_size); \
    VLOG(1) << "Set " #attr " " << attrValue << " byte";                 \
    PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute(        \
        attr, &attrValueSize, &attrValue));                              \
  }
  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE);
  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP);
  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE);
#undef MULTIPLY_ATTR_VALUE
#endif
}
|
|
||
| void InitDevices(bool init_p2p) { | ||
| // CUPTI attribute should be set before any CUDA context is created (see CUPTI | ||
| // documentation). | ||
|
||
| InitCupti(); | ||
| /*Init all available devices by default */ | ||
| std::vector<int> devices; | ||
| #ifdef PADDLE_WITH_CUDA | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -198,11 +198,19 @@ RecordBlock::~RecordBlock() { | |
| ClearCurBlock(); | ||
| } | ||
|
|
||
// Waits for outstanding work on every device reported by GetCUDADeviceCount()
// by selecting each device in turn and calling cudaDeviceSynchronize() on it.
//
// The body is only compiled when PADDLE_WITH_CUDA is defined, so callers can
// invoke this unconditionally and CPU-only builds get a no-op.
//
// NOTE(review): the loop leaves the current device set to the last index it
// visited; the previously selected device is not restored.
void SynchronizeAllDevice() {
#ifdef PADDLE_WITH_CUDA
  const int count = GetCUDADeviceCount();
  for (int i = 0; i < count; ++i) {
    SetDeviceId(i);
    PADDLE_ENFORCE(cudaDeviceSynchronize());
  }
#endif
}
|
|
||
| void EnableProfiler(ProfilerState state) { | ||
| PADDLE_ENFORCE(state != ProfilerState::kDisabled, | ||
| "Can't enable profiling, since the input state is ", | ||
| "ProfilerState::kDisabled"); | ||
|
|
||
| SynchronizeAllDevice(); | ||
| std::lock_guard<std::mutex> l(profiler_mu); | ||
| if (state == g_state) { | ||
| return; | ||
|
|
@@ -223,6 +231,7 @@ void EnableProfiler(ProfilerState state) { | |
| } | ||
|
|
||
| void ResetProfiler() { | ||
| SynchronizeAllDevice(); | ||
| GetDeviceTracer()->Reset(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What do you want here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed.
||
| std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); | ||
| for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); | ||
|
|
@@ -450,6 +459,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, | |
|
|
||
| void DisableProfiler(EventSortingKey sorted_key, | ||
| const std::string& profile_path) { | ||
| SynchronizeAllDevice(); | ||
| std::lock_guard<std::mutex> l(profiler_mu); | ||
| if (g_state == ProfilerState::kDisabled) return; | ||
| // Mark the profiling stop. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is just the value that gave the best performance in experiments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed