Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
225d331
refine profiler && add runtime tracer
cjld Jan 12, 2019
fc6dc7d
test=develop
cjld Jan 12, 2019
41c82ca
test=develop
cjld Jan 12, 2019
e9a625f
test=develop
cjld Jan 12, 2019
98a7543
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
cjld Jan 12, 2019
a84497c
test=develop
cjld Jan 12, 2019
c431b6a
test=develop
cjld Jan 13, 2019
e069fab
test=develop
cjld Jan 13, 2019
31be7bf
test=develop
cjld Jan 14, 2019
d17f79e
test=develop
cjld Jan 15, 2019
bb49a5b
fix bug && test=develop
cjld Jan 18, 2019
c2cb082
add thread id map && test=develop
cjld Jan 20, 2019
18f9158
test=develop
cjld Jan 20, 2019
eeb2aa7
testing
cjld Feb 3, 2019
5510f31
bug fix
cjld Feb 3, 2019
b59cd77
Merge branch 'profiler_refine_tmp' into profiler_refine
cjld Feb 3, 2019
fbd3122
remove cuda event && refine code && test=develop
cjld Feb 3, 2019
29c461f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
cjld Feb 3, 2019
dda480c
test=develop
cjld Feb 3, 2019
fa15b9f
test=develop
cjld Feb 3, 2019
6188131
test=develop
cjld Feb 3, 2019
02158db
fix windows temp file && test=develop
cjld Feb 15, 2019
a1ca223
test=develop
cjld Feb 17, 2019
58dea48
fix windows bug && test=develop
cjld Feb 17, 2019
e46c6b9
fix start up issue && test=develop
cjld Feb 17, 2019
b00bee2
code polish && test=develop
cjld Feb 18, 2019
058bcd8
remove unused code && test=develop
cjld Feb 18, 2019
f0c9f10
add some cupti cbid && test=develop
cjld Feb 18, 2019
f878edf
add FLAGS_multiple_of_cupti_buffer_size && test=develop
cjld Feb 19, 2019
3f29bb8
fix compile error && test=develop
cjld Feb 20, 2019
12ecf7d
add keyword && test=develop
cjld Feb 20, 2019
03a7de2
fix && test=develop
cjld Feb 20, 2019
4f39900
code polish && test=develop
cjld Feb 20, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
903 changes: 903 additions & 0 deletions paddle/fluid/platform/cupti_cbid_str.h

Large diffs are not rendered by default.

237 changes: 197 additions & 40 deletions paddle/fluid/platform/device_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,35 @@ limitations under the License. */

#include <deque>
#include <fstream>
#include <list>
#include <map>
#include <mutex> // NOLINT
#include <numeric>
#include <sstream>
#include <string>
#include <thread> // NOLINT
#include <utility>
#include <vector>

#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/cupti_cbid_str.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#ifdef __linux__
#include <pthread.h>
#endif

namespace paddle {
namespace platform {
namespace {
// Tracking the nested block stacks of each thread.
thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks.
thread_local std::deque<std::string> annotation_stack;
thread_local std::deque<Event *> annotation_stack;

std::map<uint32_t, int32_t> system_thread_id_map;

std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr;
Expand Down Expand Up @@ -92,15 +102,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
return "MEMCPY";
}

std::string DriverKind(CUpti_CallbackId cbid) {
if (static_cast<size_t>(cbid) >= driver_cbid_str.size())
return "Driver API " + std::to_string(cbid);
// remove the "CUPTI_DRIVER_TRACE_CBID_" prefix
return driver_cbid_str[cbid].substr(24);
}

std::string RuntimeKind(CUpti_CallbackId cbid) {
if (static_cast<size_t>(cbid) >= runtime_cbid_str.size())
return "Runtime API " + std::to_string(cbid);
// remove the "CUPTI_RUNTIME_TRACE_CBID_" prefix
return runtime_cbid_str[cbid].substr(25);
}

void EnableActivity() {
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
CUPTI_CALL(
dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What information can be collected by using CUPTI_ACTIVITY_KIND_DRIVER and CUPTI_ACTIVITY_KIND_RUNTIME ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA Runtime api like: cudamalloc, cudafree, launchkernel (run on cpu)

// We don't track these activities for now.
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
Expand All @@ -110,7 +138,8 @@ void EnableActivity() {

void DisableActivity() {
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(
dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
// Disable all other activity record kinds.
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
Expand All @@ -132,6 +161,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,

void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
size_t size, size_t validSize) {
static std::thread::id cupti_thread_id(0);
if (cupti_thread_id == std::thread::id(0))
cupti_thread_id = std::this_thread::get_id();
PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
"Only one thread is allowed to call bufferCompleted()");
CUptiResult status;
CUpti_Activity *record = NULL;
if (validSize > 0) {
Expand Down Expand Up @@ -168,6 +202,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
memcpy->correlationId, memcpy->bytes);
break;
}
case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
// -1 device id represents CUDA api call
tracer->AddCPURecords(
DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
break;
}
case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
tracer->AddCPURecords(
RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
break;
}
default: { break; }
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
Expand Down Expand Up @@ -195,9 +246,15 @@ class DeviceTracerImpl : public DeviceTracer {
public:
DeviceTracerImpl() : enabled_(false) {}

void AddAnnotation(uint64_t id, const std::string &anno) {
std::lock_guard<std::mutex> l(trace_mu_);
correlations_[id] = anno;
void AddAnnotation(uint64_t id, Event *event) {
thread_local std::list<std::pair<uint64_t, Event *>>
*local_correlations_pairs = nullptr;
if (local_correlations_pairs == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
correlations_pairs.emplace_back();
local_correlations_pairs = &correlations_pairs.back();
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line 238~242 is not thread-safe.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? Same as above, Each thread has its own list and won't conflict with each other.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, thanks.
Another comment is all variables declared with the thread_local keyword have thread storage duration, so static is unnecessary in line236.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, fixed

local_correlations_pairs->push_back(std::make_pair(id, event));
}

void AddCPURecords(const std::string &anno, uint64_t start_ns,
Expand All @@ -206,8 +263,13 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(1) << "Empty timeline annotation.";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
thread_local std::list<CPURecord> *local_cpu_records_ = nullptr;
if (local_cpu_records_ == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.emplace_back();
local_cpu_records_ = &cpu_records_.back();
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The above code is not thread-safe.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? Each thread has its own list and won't conflict with each other.

local_cpu_records_->push_back(
CPURecord{anno, start_ns, end_ns, device_id, thread_id});
}

Expand All @@ -219,7 +281,8 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(3) << name << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
// NOTE(liangdun): lock is not needed, only one thread call this function.
// std::lock_guard<std::mutex> l(trace_mu_);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line 302 can be removed, because it is an unnecessary code,

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}
Expand All @@ -232,7 +295,8 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(3) << correlation_id << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
// NOTE(liangdun): lock is not needed, only one thread call this function.
// std::lock_guard<std::mutex> l(trace_mu_);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure bufferCompleted is executed by single thread?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, pretty sure, see the PADDLE_ENFORCE I added in the latest commit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line 317 should be removed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

kernel_records_.push_back(
KernelRecord{name, start, end, device_id, stream_id, correlation_id});
}
Expand Down Expand Up @@ -266,49 +330,100 @@ class DeviceTracerImpl : public DeviceTracer {
CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
CUPTI_CALL(dynload::cuptiEnableCallback(
1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020));
CUPTI_CALL(dynload::cuptiEnableCallback(
1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
#endif // PADDLE_WITH_CUPTI
enabled_ = true;
}

void Reset() {
std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.clear();
mem_records_.clear();
correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear();
}

void GenEventKernelCudaElapsedTime() {
#ifdef PADDLE_WITH_CUPTI
if (correlations_.empty())
for (auto &tmp : correlations_pairs)
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) != correlations_.end()) {
Event *e = correlations_.at(r.correlation_id);
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
}
}
for (const auto &r : mem_records_) {
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
Event *e = c->second;
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
}
}
#endif
}

proto::Profile GenProfile(const std::string &profile_path) {
std::lock_guard<std::mutex> l(trace_mu_);
proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_);
if (correlations_.empty())
for (auto &tmp : correlations_pairs)
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
for (const KernelRecord &r : kernel_records_) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel);
if (correlations_.find(r.correlation_id) != correlations_.end()) {
event->set_name(correlations_.at(r.correlation_id));
event->set_name(correlations_.at(r.correlation_id)->name() + "::" +
r.name);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are two map lookups, i.g. correlations_.find and correlations_.at here, it can be reduced to one time.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

} else {
PADDLE_ENFORCE(false, "Missing Kernel Event: " + r.name);
event->set_name(r.name);
}
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id);
}

for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
for (auto &tmp : cpu_records_)
for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
int miss = 0, find = 0;
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel);
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name() + "::" + r.name);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that we should set the event name with c->second->name(), and use another another field to record r.name, for example, detail_info.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed, add detail_info

find++;
} else {
miss++;
event->set_name(r.name);
}
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc);
std::string profile_str;
Expand Down Expand Up @@ -337,15 +452,16 @@ class DeviceTracerImpl : public DeviceTracer {
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) {
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata);
DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);

if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
if (((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) ||
((domain == CUPTI_CB_DOMAIN_RUNTIME_API) &&
(cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 ||
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020))) {
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
const std::string anno = !annotation_stack.empty()
? annotation_stack.back()
: cbInfo->symbolName;
tracer->AddAnnotation(cbInfo->correlationId, anno);
Event *event = CurAnnotation();
tracer->AddAnnotation(cbInfo->correlationId, event);
}
} else {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
Expand All @@ -357,10 +473,11 @@ class DeviceTracerImpl : public DeviceTracer {
bool enabled_;
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
std::list<KernelRecord> kernel_records_;
std::list<MemRecord> mem_records_;
std::list<std::list<CPURecord>> cpu_records_;
std::list<std::list<std::pair<uint64_t, Event *>>> correlations_pairs;
std::unordered_map<uint64_t, Event *> correlations_;
};

void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
Expand All @@ -370,21 +487,61 @@ DeviceTracer *GetDeviceTracer() {
return tracer;
}

void SetCurAnnotation(const std::string &anno) {
annotation_stack.push_back(anno);
}
void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }

void ClearCurAnnotation() { annotation_stack.pop_back(); }

std::string CurAnnotation() {
if (annotation_stack.empty()) return "";
Event *CurAnnotation() {
if (annotation_stack.empty()) return nullptr;
return annotation_stack.back();
}
std::string CurAnnotationName() {
if (annotation_stack.empty()) return "";
return annotation_stack.back()->name();
}

void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }

void ClearCurBlock() { block_id_stack.pop_back(); }

int BlockDepth() { return block_id_stack.size(); }

uint32_t GetCurSystemThreadId() {
std::stringstream ss;
ss << std::this_thread::get_id();
uint32_t id = static_cast<uint32_t>(std::stoull(ss.str()));
return id;
}

int64_t GetCurSystemThreadIdFromPthread() {
#ifdef __linux__
return static_cast<int64_t>(pthread_self());
#endif
return 0;
}

void SetCurThreadId(int32_t id) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SetCurThreadId is confusing, it seems that you want to reset the current threadID.
Please change a name, i.e. RecoreCurThreadId.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

/*
NOTE(liangdun): In order to map CUDA API thread id into paddle's thread id,
we have to get current thread id. However, I can not find other elegant and
portable way to get integer thread id. One way is get thread id from
pthread. But this method only works on Linux. Another way is using
stringstream, this method is more portable. These two methods are
equivalent In Linux.
*/
auto gid = GetCurSystemThreadId();
auto gid_p = GetCurSystemThreadIdFromPthread();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should use gid here, and GetCurSystemThreadIdFromPthread should be removed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed

VLOG(1) << "SetCurThreadId: " << gid << " -> " << id
<< " pthread_id:" << gid_p;
system_thread_id_map[gid] = id;
}

int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
auto it = system_thread_id_map.find(id);
if (it != system_thread_id_map.end()) return it->second;
// return origin id if no event is recorded in this thread.
return static_cast<int32_t>(id);
}

} // namespace platform
} // namespace paddle
Loading