-
Notifications
You must be signed in to change notification settings - Fork 6k
Profiler refine and add CUDA runtime api tracer #15301
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
225d331
fc6dc7d
41c82ca
e9a625f
98a7543
a84497c
c431b6a
e069fab
31be7bf
d17f79e
bb49a5b
c2cb082
18f9158
eeb2aa7
5510f31
b59cd77
fbd3122
29c461f
dda480c
fa15b9f
6188131
02158db
a1ca223
58dea48
e46c6b9
b00bee2
058bcd8
f0c9f10
f878edf
3f29bb8
12ecf7d
03a7de2
4f39900
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,25 +15,35 @@ limitations under the License. */ | |
|
|
||
| #include <deque> | ||
| #include <fstream> | ||
| #include <list> | ||
| #include <map> | ||
| #include <mutex> // NOLINT | ||
| #include <numeric> | ||
| #include <sstream> | ||
| #include <string> | ||
| #include <thread> // NOLINT | ||
| #include <utility> | ||
| #include <vector> | ||
|
|
||
| #include "glog/logging.h" | ||
| #include "google/protobuf/text_format.h" | ||
| #include "paddle/fluid/framework/block_desc.h" | ||
| #include "paddle/fluid/platform/cupti_cbid_str.h" | ||
| #include "paddle/fluid/platform/profiler.h" | ||
| #include "paddle/fluid/string/printf.h" | ||
| #ifdef __linux__ | ||
| #include <pthread.h> | ||
| #endif | ||
|
|
||
| namespace paddle { | ||
| namespace platform { | ||
| namespace { | ||
| // Tracking the nested block stacks of each thread. | ||
| thread_local std::deque<int> block_id_stack; | ||
| // Tracking the nested event stacks. | ||
| thread_local std::deque<std::string> annotation_stack; | ||
| thread_local std::deque<Event *> annotation_stack; | ||
|
|
||
| std::map<uint32_t, int32_t> system_thread_id_map; | ||
|
|
||
| std::once_flag tracer_once_flag; | ||
| DeviceTracer *tracer = nullptr; | ||
|
|
@@ -92,15 +102,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) { | |
| return "MEMCPY"; | ||
| } | ||
|
|
||
| std::string DriverKind(CUpti_CallbackId cbid) { | ||
| if (static_cast<size_t>(cbid) >= driver_cbid_str.size()) | ||
| return "Driver API " + std::to_string(cbid); | ||
| // remove the "CUPTI_DRIVER_TRACE_CBID_" prefix | ||
| return driver_cbid_str[cbid].substr(24); | ||
| } | ||
|
|
||
| std::string RuntimeKind(CUpti_CallbackId cbid) { | ||
| if (static_cast<size_t>(cbid) >= runtime_cbid_str.size()) | ||
| return "Runtime API " + std::to_string(cbid); | ||
| // remove the "CUPTI_RUNTIME_TRACE_CBID_" prefix | ||
| return runtime_cbid_str[cbid].substr(25); | ||
| } | ||
|
|
||
| void EnableActivity() { | ||
| // Device activity record is created when CUDA initializes, so we | ||
| // want to enable it before cuInit() or any CUDA runtime call. | ||
| CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); | ||
| CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); | ||
| CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); | ||
| CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); | ||
| CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); | ||
| CUPTI_CALL( | ||
| dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); | ||
| // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); | ||
| CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); | ||
| CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); | ||
| // We don't track these activities for now. | ||
| // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); | ||
| // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); | ||
| // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); | ||
| // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); | ||
| // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); | ||
| // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); | ||
|
|
@@ -110,7 +138,8 @@ void EnableActivity() { | |
|
|
||
| void DisableActivity() { | ||
| CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); | ||
| CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); | ||
| CUPTI_CALL( | ||
| dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); | ||
| CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); | ||
| // Disable all other activity record kinds. | ||
| CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); | ||
|
|
@@ -132,6 +161,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, | |
|
|
||
| void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, | ||
| size_t size, size_t validSize) { | ||
| static std::thread::id cupti_thread_id(0); | ||
| if (cupti_thread_id == std::thread::id(0)) | ||
| cupti_thread_id = std::this_thread::get_id(); | ||
| PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id, | ||
| "Only one thread is allowed to call bufferCompleted()"); | ||
| CUptiResult status; | ||
| CUpti_Activity *record = NULL; | ||
| if (validSize > 0) { | ||
|
|
@@ -168,6 +202,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, | |
| memcpy->correlationId, memcpy->bytes); | ||
| break; | ||
| } | ||
| case CUPTI_ACTIVITY_KIND_DRIVER: { | ||
| auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record); | ||
| if (api->start != 0 && api->end != 0) | ||
| // -1 device id represents CUDA api call | ||
| tracer->AddCPURecords( | ||
| DriverKind(api->cbid), api->start, api->end, -1, | ||
| GetThreadIdFromSystemThreadId(api->threadId)); | ||
| break; | ||
| } | ||
| case CUPTI_ACTIVITY_KIND_RUNTIME: { | ||
| auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record); | ||
| if (api->start != 0 && api->end != 0) | ||
| tracer->AddCPURecords( | ||
| RuntimeKind(api->cbid), api->start, api->end, -1, | ||
| GetThreadIdFromSystemThreadId(api->threadId)); | ||
| break; | ||
| } | ||
| default: { break; } | ||
| } | ||
| } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { | ||
|
|
@@ -195,9 +246,15 @@ class DeviceTracerImpl : public DeviceTracer { | |
| public: | ||
| DeviceTracerImpl() : enabled_(false) {} | ||
|
|
||
| void AddAnnotation(uint64_t id, const std::string &anno) { | ||
| std::lock_guard<std::mutex> l(trace_mu_); | ||
| correlations_[id] = anno; | ||
| void AddAnnotation(uint64_t id, Event *event) { | ||
| thread_local std::list<std::pair<uint64_t, Event *>> | ||
| *local_correlations_pairs = nullptr; | ||
| if (local_correlations_pairs == nullptr) { | ||
| std::lock_guard<std::mutex> l(trace_mu_); | ||
| correlations_pairs.emplace_back(); | ||
| local_correlations_pairs = &correlations_pairs.back(); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. line 238~242 is not thread-safe.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? Same as above, Each thread has its own list and won't conflict with each other.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see, thanks.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, fixed |
||
| local_correlations_pairs->push_back(std::make_pair(id, event)); | ||
| } | ||
|
|
||
| void AddCPURecords(const std::string &anno, uint64_t start_ns, | ||
|
|
@@ -206,8 +263,13 @@ class DeviceTracerImpl : public DeviceTracer { | |
| VLOG(1) << "Empty timeline annotation."; | ||
| return; | ||
| } | ||
| std::lock_guard<std::mutex> l(trace_mu_); | ||
| cpu_records_.push_back( | ||
| thread_local std::list<CPURecord> *local_cpu_records_ = nullptr; | ||
| if (local_cpu_records_ == nullptr) { | ||
| std::lock_guard<std::mutex> l(trace_mu_); | ||
| cpu_records_.emplace_back(); | ||
| local_cpu_records_ = &cpu_records_.back(); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The above code is not thread-safe.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? Each thread has its own list and won't conflict with each other. |
||
| local_cpu_records_->push_back( | ||
| CPURecord{anno, start_ns, end_ns, device_id, thread_id}); | ||
| } | ||
|
|
||
|
|
@@ -219,7 +281,8 @@ class DeviceTracerImpl : public DeviceTracer { | |
| VLOG(3) << name << " cannot be traced"; | ||
| return; | ||
| } | ||
| std::lock_guard<std::mutex> l(trace_mu_); | ||
| // NOTE(liangdun): lock is not needed, only one thread call this function. | ||
| // std::lock_guard<std::mutex> l(trace_mu_); | ||
|
||
| mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, | ||
| stream_id, correlation_id, bytes}); | ||
| } | ||
|
|
@@ -232,7 +295,8 @@ class DeviceTracerImpl : public DeviceTracer { | |
| VLOG(3) << correlation_id << " cannot be traced"; | ||
| return; | ||
| } | ||
| std::lock_guard<std::mutex> l(trace_mu_); | ||
| // NOTE(liangdun): lock is not needed, only one thread call this function. | ||
| // std::lock_guard<std::mutex> l(trace_mu_); | ||
|
||
| kernel_records_.push_back( | ||
| KernelRecord{name, start, end, device_id, stream_id, correlation_id}); | ||
| } | ||
|
|
@@ -266,49 +330,100 @@ class DeviceTracerImpl : public DeviceTracer { | |
| CUPTI_CALL( | ||
| dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, | ||
| CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); | ||
| CUPTI_CALL(dynload::cuptiEnableCallback( | ||
| 1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, | ||
| CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020)); | ||
| CUPTI_CALL(dynload::cuptiEnableCallback( | ||
| 1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, | ||
| CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020)); | ||
| CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); | ||
| #endif // PADDLE_WITH_CUPTI | ||
| enabled_ = true; | ||
| } | ||
|
|
||
| void Reset() { | ||
| std::lock_guard<std::mutex> l(trace_mu_); | ||
| kernel_records_.clear(); | ||
| mem_records_.clear(); | ||
| correlations_.clear(); | ||
| for (auto &tmp : correlations_pairs) tmp.clear(); | ||
| for (auto &tmp : cpu_records_) tmp.clear(); | ||
| } | ||
|
|
||
| void GenEventKernelCudaElapsedTime() { | ||
| #ifdef PADDLE_WITH_CUPTI | ||
| if (correlations_.empty()) | ||
| for (auto &tmp : correlations_pairs) | ||
| for (auto &pair : tmp) correlations_[pair.first] = pair.second; | ||
| for (const KernelRecord &r : kernel_records_) { | ||
| if (correlations_.find(r.correlation_id) != correlations_.end()) { | ||
| Event *e = correlations_.at(r.correlation_id); | ||
| e->AddCudaElapsedTime(r.start_ns, r.end_ns); | ||
| } | ||
| } | ||
| for (const auto &r : mem_records_) { | ||
| auto c = correlations_.find(r.correlation_id); | ||
| if (c != correlations_.end() && c->second != nullptr) { | ||
| Event *e = c->second; | ||
| e->AddCudaElapsedTime(r.start_ns, r.end_ns); | ||
| } | ||
| } | ||
| #endif | ||
| } | ||
|
|
||
| proto::Profile GenProfile(const std::string &profile_path) { | ||
| std::lock_guard<std::mutex> l(trace_mu_); | ||
| proto::Profile profile_pb; | ||
| profile_pb.set_start_ns(start_ns_); | ||
| profile_pb.set_end_ns(end_ns_); | ||
| if (correlations_.empty()) | ||
| for (auto &tmp : correlations_pairs) | ||
| for (auto &pair : tmp) correlations_[pair.first] = pair.second; | ||
| for (const KernelRecord &r : kernel_records_) { | ||
| auto *event = profile_pb.add_events(); | ||
| event->set_type(proto::Event::GPUKernel); | ||
| if (correlations_.find(r.correlation_id) != correlations_.end()) { | ||
| event->set_name(correlations_.at(r.correlation_id)); | ||
| event->set_name(correlations_.at(r.correlation_id)->name() + "::" + | ||
| r.name); | ||
|
||
| } else { | ||
| PADDLE_ENFORCE(false, "Missing Kernel Event: " + r.name); | ||
| event->set_name(r.name); | ||
| } | ||
| event->set_start_ns(r.start_ns); | ||
| event->set_end_ns(r.end_ns); | ||
| event->set_sub_device_id(r.stream_id); | ||
| event->set_device_id(r.device_id); | ||
| } | ||
|
|
||
| for (const CPURecord &r : cpu_records_) { | ||
| auto *event = profile_pb.add_events(); | ||
| event->set_type(proto::Event::CPU); | ||
| event->set_name(r.name); | ||
| event->set_start_ns(r.start_ns); | ||
| event->set_end_ns(r.end_ns); | ||
| event->set_sub_device_id(r.thread_id); | ||
| event->set_device_id(r.device_id); | ||
| } | ||
| for (auto &tmp : cpu_records_) | ||
| for (const CPURecord &r : tmp) { | ||
| auto *event = profile_pb.add_events(); | ||
| event->set_type(proto::Event::CPU); | ||
| event->set_name(r.name); | ||
| event->set_start_ns(r.start_ns); | ||
| event->set_end_ns(r.end_ns); | ||
| event->set_sub_device_id(r.thread_id); | ||
| event->set_device_id(r.device_id); | ||
| } | ||
| int miss = 0, find = 0; | ||
| for (const MemRecord &r : mem_records_) { | ||
| auto *event = profile_pb.add_events(); | ||
| event->set_type(proto::Event::GPUKernel); | ||
| auto c = correlations_.find(r.correlation_id); | ||
| if (c != correlations_.end() && c->second != nullptr) { | ||
| event->set_name(c->second->name() + "::" + r.name); | ||
|
||
| find++; | ||
| } else { | ||
| miss++; | ||
| event->set_name(r.name); | ||
| } | ||
| event->set_name(r.name); | ||
| event->set_start_ns(r.start_ns); | ||
| event->set_end_ns(r.end_ns); | ||
| event->set_sub_device_id(r.stream_id); | ||
| event->set_device_id(r.device_id); | ||
| event->mutable_memcopy()->set_bytes(r.bytes); | ||
| } | ||
| VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; | ||
| std::ofstream profile_f; | ||
| profile_f.open(profile_path, std::ios::out | std::ios::trunc); | ||
| std::string profile_str; | ||
|
|
@@ -337,15 +452,16 @@ class DeviceTracerImpl : public DeviceTracer { | |
| static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, | ||
| CUpti_CallbackId cbid, const void *cbdata) { | ||
| auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata); | ||
| DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata); | ||
| DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata); | ||
|
|
||
| if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && | ||
| (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) { | ||
| if (((domain == CUPTI_CB_DOMAIN_DRIVER_API) && | ||
| (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) || | ||
| ((domain == CUPTI_CB_DOMAIN_RUNTIME_API) && | ||
| (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 || | ||
| cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020))) { | ||
| if (cbInfo->callbackSite == CUPTI_API_ENTER) { | ||
| const std::string anno = !annotation_stack.empty() | ||
| ? annotation_stack.back() | ||
| : cbInfo->symbolName; | ||
| tracer->AddAnnotation(cbInfo->correlationId, anno); | ||
| Event *event = CurAnnotation(); | ||
| tracer->AddAnnotation(cbInfo->correlationId, event); | ||
| } | ||
| } else { | ||
| VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; | ||
|
|
@@ -357,10 +473,11 @@ class DeviceTracerImpl : public DeviceTracer { | |
| bool enabled_; | ||
| uint64_t start_ns_; | ||
| uint64_t end_ns_; | ||
| std::vector<KernelRecord> kernel_records_; | ||
| std::vector<MemRecord> mem_records_; | ||
| std::vector<CPURecord> cpu_records_; | ||
| std::unordered_map<uint32_t, std::string> correlations_; | ||
| std::list<KernelRecord> kernel_records_; | ||
| std::list<MemRecord> mem_records_; | ||
| std::list<std::list<CPURecord>> cpu_records_; | ||
| std::list<std::list<std::pair<uint64_t, Event *>>> correlations_pairs; | ||
| std::unordered_map<uint64_t, Event *> correlations_; | ||
| }; | ||
|
|
||
| void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); } | ||
|
|
@@ -370,21 +487,61 @@ DeviceTracer *GetDeviceTracer() { | |
| return tracer; | ||
| } | ||
|
|
||
| void SetCurAnnotation(const std::string &anno) { | ||
| annotation_stack.push_back(anno); | ||
| } | ||
| void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); } | ||
|
|
||
| void ClearCurAnnotation() { annotation_stack.pop_back(); } | ||
|
|
||
| std::string CurAnnotation() { | ||
| if (annotation_stack.empty()) return ""; | ||
| Event *CurAnnotation() { | ||
| if (annotation_stack.empty()) return nullptr; | ||
| return annotation_stack.back(); | ||
| } | ||
| std::string CurAnnotationName() { | ||
| if (annotation_stack.empty()) return ""; | ||
| return annotation_stack.back()->name(); | ||
| } | ||
|
|
||
| void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } | ||
|
|
||
| void ClearCurBlock() { block_id_stack.pop_back(); } | ||
|
|
||
| int BlockDepth() { return block_id_stack.size(); } | ||
|
|
||
| uint32_t GetCurSystemThreadId() { | ||
| std::stringstream ss; | ||
| ss << std::this_thread::get_id(); | ||
| uint32_t id = static_cast<uint32_t>(std::stoull(ss.str())); | ||
| return id; | ||
| } | ||
|
|
||
| int64_t GetCurSystemThreadIdFromPthread() { | ||
| #ifdef __linux__ | ||
| return static_cast<int64_t>(pthread_self()); | ||
| #endif | ||
| return 0; | ||
| } | ||
|
|
||
| void SetCurThreadId(int32_t id) { | ||
|
||
| /* | ||
| NOTE(liangdun): In order to map CUDA API thread id into paddle's thread id, | ||
| we have to get current thread id. However, I can not find other elegant and | ||
| portable way to get integer thread id. One way is get thread id from | ||
| pthread. But this method only works on Linux. Another way is using | ||
| stringstream, this method is more portable. These two methods are | ||
| equivalent In Linux. | ||
| */ | ||
| auto gid = GetCurSystemThreadId(); | ||
| auto gid_p = GetCurSystemThreadIdFromPthread(); | ||
|
||
| VLOG(1) << "SetCurThreadId: " << gid << " -> " << id | ||
| << " pthread_id:" << gid_p; | ||
| system_thread_id_map[gid] = id; | ||
| } | ||
|
|
||
| int32_t GetThreadIdFromSystemThreadId(uint32_t id) { | ||
| auto it = system_thread_id_map.find(id); | ||
| if (it != system_thread_id_map.end()) return it->second; | ||
| // return origin id if no event is recorded in this thread. | ||
| return static_cast<int32_t>(id); | ||
| } | ||
|
|
||
| } // namespace platform | ||
| } // namespace paddle | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What information can be collected by using
CUPTI_ACTIVITY_KIND_DRIVERandCUPTI_ACTIVITY_KIND_RUNTIME?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CUDA Runtime api like: cudamalloc, cudafree, launchkernel (run on cpu)