PaddlePaddle · cjld · Feb 21, 2019 · Jan 12, 2019 · Jan 12, 2019 · Jan 12, 2019
diff --git a/paddle/fluid/platform/cupti_cbid_str.h b/paddle/fluid/platform/cupti_cbid_str.h
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
@@ -15,25 +15,35 @@ limitations under the License. */
 
 #include <deque>
 #include <fstream>
+#include <list>
 #include <map>
 #include <mutex>  // NOLINT
 #include <numeric>
+#include <sstream>
 #include <string>
 #include <thread>  // NOLINT
+#include <utility>
 #include <vector>
 
 #include "glog/logging.h"
 #include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/platform/cupti_cbid_str.h"
+#include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/printf.h"
+#ifdef __linux__
+#include <pthread.h>
+#endif
 
 namespace paddle {
 namespace platform {
 namespace {
 // Tracking the nested block stacks of each thread.
 thread_local std::deque<int> block_id_stack;
 // Tracking the nested event stacks.
-thread_local std::deque<std::string> annotation_stack;
+thread_local std::deque<Event *> annotation_stack;
+
+std::map<uint32_t, int32_t> system_thread_id_map;
 
 std::once_flag tracer_once_flag;
 DeviceTracer *tracer = nullptr;
@@ -92,15 +102,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
   return "MEMCPY";
 }
 
+std::string DriverKind(CUpti_CallbackId cbid) {
+  if (static_cast<size_t>(cbid) >= driver_cbid_str.size())
+    return "Driver API " + std::to_string(cbid);
+  // remove the "CUPTI_DRIVER_TRACE_CBID_" prefix
+  return driver_cbid_str[cbid].substr(24);
+}
+
+std::string RuntimeKind(CUpti_CallbackId cbid) {
+  if (static_cast<size_t>(cbid) >= runtime_cbid_str.size())
+    return "Runtime API " + std::to_string(cbid);
+  // remove the "CUPTI_RUNTIME_TRACE_CBID_" prefix
+  return runtime_cbid_str[cbid].substr(25);
+}
+
 void EnableActivity() {
   // Device activity record is created when CUDA initializes, so we
   // want to enable it before cuInit() or any CUDA runtime call.
   CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
-  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+  CUPTI_CALL(
+      dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
+  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
+  CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
   // We don't track these activities for now.
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
+  // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
   // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
@@ -110,7 +138,8 @@ void EnableActivity() {
 
 void DisableActivity() {
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
-  CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
+  CUPTI_CALL(
+      dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
   // Disable all other activity record kinds.
   CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
@@ -132,6 +161,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
 
 void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                               size_t size, size_t validSize) {
+  static std::thread::id cupti_thread_id(0);
+  if (cupti_thread_id == std::thread::id(0))
+    cupti_thread_id = std::this_thread::get_id();
+  PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
+                    "Only one thread is allowed to call bufferCompleted()");
   CUptiResult status;
   CUpti_Activity *record = NULL;
   if (validSize > 0) {
@@ -168,6 +202,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                 memcpy->correlationId, memcpy->bytes);
             break;
           }
+          case CUPTI_ACTIVITY_KIND_DRIVER: {
+            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
+            if (api->start != 0 && api->end != 0)
+              // -1 device id represents CUDA api call
+              tracer->AddCPURecords(
+                  DriverKind(api->cbid), api->start, api->end, -1,
+                  GetThreadIdFromSystemThreadId(api->threadId));
+            break;
+          }
+          case CUPTI_ACTIVITY_KIND_RUNTIME: {
+            auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
+            if (api->start != 0 && api->end != 0)
+              tracer->AddCPURecords(
+                  RuntimeKind(api->cbid), api->start, api->end, -1,
+                  GetThreadIdFromSystemThreadId(api->threadId));
+            break;
+          }
           default: { break; }
         }
       } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -195,9 +246,15 @@ class DeviceTracerImpl : public DeviceTracer {
  public:
   DeviceTracerImpl() : enabled_(false) {}
 
-  void AddAnnotation(uint64_t id, const std::string &anno) {
-    std::lock_guard<std::mutex> l(trace_mu_);
-    correlations_[id] = anno;
+  void AddAnnotation(uint64_t id, Event *event) {
+    thread_local std::list<std::pair<uint64_t, Event *>>
+        *local_correlations_pairs = nullptr;
+    if (local_correlations_pairs == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      correlations_pairs.emplace_back();
+      local_correlations_pairs = &correlations_pairs.back();
+    }
+    local_correlations_pairs->push_back(std::make_pair(id, event));
   }
 
   void AddCPURecords(const std::string &anno, uint64_t start_ns,
@@ -206,8 +263,13 @@ class DeviceTracerImpl : public DeviceTracer {
       VLOG(1) << "Empty timeline annotation.";
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
-    cpu_records_.push_back(
+    thread_local std::list<CPURecord> *local_cpu_records_ = nullptr;
+    if (local_cpu_records_ == nullptr) {
+      std::lock_guard<std::mutex> l(trace_mu_);
+      cpu_records_.emplace_back();
+      local_cpu_records_ = &cpu_records_.back();
+    }
+    local_cpu_records_->push_back(
         CPURecord{anno, start_ns, end_ns, device_id, thread_id});
   }
 
@@ -219,7 +281,8 @@ class DeviceTracerImpl : public DeviceTracer {
       VLOG(3) << name << " cannot be traced";
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
+    // NOTE(liangdun): lock is not needed, only one thread call this function.
+    // std::lock_guard<std::mutex> l(trace_mu_);
     mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
                                      stream_id, correlation_id, bytes});
   }
@@ -232,7 +295,8 @@ class DeviceTracerImpl : public DeviceTracer {
       VLOG(3) << correlation_id << " cannot be traced";
       return;
     }
-    std::lock_guard<std::mutex> l(trace_mu_);
+    // NOTE(liangdun): lock is not needed, only one thread call this function.
+    // std::lock_guard<std::mutex> l(trace_mu_);
     kernel_records_.push_back(
         KernelRecord{name, start, end, device_id, stream_id, correlation_id});
   }
@@ -266,49 +330,100 @@ class DeviceTracerImpl : public DeviceTracer {
     CUPTI_CALL(
         dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
                                      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
+    CUPTI_CALL(dynload::cuptiEnableCallback(
+        1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API,
+        CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020));
+    CUPTI_CALL(dynload::cuptiEnableCallback(
+        1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API,
+        CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020));
     CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
 #endif  // PADDLE_WITH_CUPTI
     enabled_ = true;
   }
 
+  void Reset() {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    kernel_records_.clear();
+    mem_records_.clear();
+    correlations_.clear();
+    for (auto &tmp : correlations_pairs) tmp.clear();
+    for (auto &tmp : cpu_records_) tmp.clear();
+  }
+
+  void GenEventKernelCudaElapsedTime() {
+#ifdef PADDLE_WITH_CUPTI
+    if (correlations_.empty())
+      for (auto &tmp : correlations_pairs)
+        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
+    for (const KernelRecord &r : kernel_records_) {
+      if (correlations_.find(r.correlation_id) != correlations_.end()) {
+        Event *e = correlations_.at(r.correlation_id);
+        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
+      }
+    }
+    for (const auto &r : mem_records_) {
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        Event *e = c->second;
+        e->AddCudaElapsedTime(r.start_ns, r.end_ns);
+      }
+    }
+#endif
+  }
+
   proto::Profile GenProfile(const std::string &profile_path) {
     std::lock_guard<std::mutex> l(trace_mu_);
     proto::Profile profile_pb;
     profile_pb.set_start_ns(start_ns_);
     profile_pb.set_end_ns(end_ns_);
+    if (correlations_.empty())
+      for (auto &tmp : correlations_pairs)
+        for (auto &pair : tmp) correlations_[pair.first] = pair.second;
     for (const KernelRecord &r : kernel_records_) {
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
       if (correlations_.find(r.correlation_id) != correlations_.end()) {
-        event->set_name(correlations_.at(r.correlation_id));
+        event->set_name(correlations_.at(r.correlation_id)->name() + "::" +
+                        r.name);
       } else {
+        PADDLE_ENFORCE(false, "Missing Kernel Event: " + r.name);
         event->set_name(r.name);
       }
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
       event->set_sub_device_id(r.stream_id);
       event->set_device_id(r.device_id);
     }
-
-    for (const CPURecord &r : cpu_records_) {
-      auto *event = profile_pb.add_events();
-      event->set_type(proto::Event::CPU);
-      event->set_name(r.name);
-      event->set_start_ns(r.start_ns);
-      event->set_end_ns(r.end_ns);
-      event->set_sub_device_id(r.thread_id);
-      event->set_device_id(r.device_id);
-    }
+    for (auto &tmp : cpu_records_)
+      for (const CPURecord &r : tmp) {
+        auto *event = profile_pb.add_events();
+        event->set_type(proto::Event::CPU);
+        event->set_name(r.name);
+        event->set_start_ns(r.start_ns);
+        event->set_end_ns(r.end_ns);
+        event->set_sub_device_id(r.thread_id);
+        event->set_device_id(r.device_id);
+      }
+    int miss = 0, find = 0;
     for (const MemRecord &r : mem_records_) {
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
+      auto c = correlations_.find(r.correlation_id);
+      if (c != correlations_.end() && c->second != nullptr) {
+        event->set_name(c->second->name() + "::" + r.name);
+        find++;
+      } else {
+        miss++;
+        event->set_name(r.name);
+      }
       event->set_name(r.name);
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
       event->set_sub_device_id(r.stream_id);
       event->set_device_id(r.device_id);
       event->mutable_memcopy()->set_bytes(r.bytes);
     }
+    VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
     std::ofstream profile_f;
     profile_f.open(profile_path, std::ios::out | std::ios::trunc);
     std::string profile_str;
@@ -337,15 +452,16 @@ class DeviceTracerImpl : public DeviceTracer {
   static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
                                    CUpti_CallbackId cbid, const void *cbdata) {
     auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
-    DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata);
+    DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
 
-    if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
-        (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
+    if (((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
+         (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) ||
+        ((domain == CUPTI_CB_DOMAIN_RUNTIME_API) &&
+         (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 ||
+          cbid == CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020))) {
       if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-        const std::string anno = !annotation_stack.empty()
-                                     ? annotation_stack.back()
-                                     : cbInfo->symbolName;
-        tracer->AddAnnotation(cbInfo->correlationId, anno);
+        Event *event = CurAnnotation();
+        tracer->AddAnnotation(cbInfo->correlationId, event);
       }
     } else {
       VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
@@ -357,10 +473,11 @@ class DeviceTracerImpl : public DeviceTracer {
   bool enabled_;
   uint64_t start_ns_;
   uint64_t end_ns_;
-  std::vector<KernelRecord> kernel_records_;
-  std::vector<MemRecord> mem_records_;
-  std::vector<CPURecord> cpu_records_;
-  std::unordered_map<uint32_t, std::string> correlations_;
+  std::list<KernelRecord> kernel_records_;
+  std::list<MemRecord> mem_records_;
+  std::list<std::list<CPURecord>> cpu_records_;
+  std::list<std::list<std::pair<uint64_t, Event *>>> correlations_pairs;
+  std::unordered_map<uint64_t, Event *> correlations_;
 };
 
 void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
@@ -370,21 +487,61 @@ DeviceTracer *GetDeviceTracer() {
   return tracer;
 }
 
-void SetCurAnnotation(const std::string &anno) {
-  annotation_stack.push_back(anno);
-}
+void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
 
 void ClearCurAnnotation() { annotation_stack.pop_back(); }
 
-std::string CurAnnotation() {
-  if (annotation_stack.empty()) return "";
+Event *CurAnnotation() {
+  if (annotation_stack.empty()) return nullptr;
   return annotation_stack.back();
 }
+std::string CurAnnotationName() {
+  if (annotation_stack.empty()) return "";
+  return annotation_stack.back()->name();
+}
 
 void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 
 void ClearCurBlock() { block_id_stack.pop_back(); }
 
 int BlockDepth() { return block_id_stack.size(); }
+
+uint32_t GetCurSystemThreadId() {
+  std::stringstream ss;
+  ss << std::this_thread::get_id();
+  uint32_t id = static_cast<uint32_t>(std::stoull(ss.str()));
+  return id;
+}
+
+int64_t GetCurSystemThreadIdFromPthread() {
+#ifdef __linux__
+  return static_cast<int64_t>(pthread_self());
+#endif
+  return 0;
+}
+
+void SetCurThreadId(int32_t id) {
+  /*
+    NOTE(liangdun): In order to map CUDA API thread id into paddle's thread id,
+    we have to get current thread id. However, I can not find other elegant and
+    portable way to get integer thread id. One way is get thread id from
+    pthread. But this method only works on Linux. Another way is using
+    stringstream, this method is more portable. These two methods are
+    equivalent In Linux.
+  */
+  auto gid = GetCurSystemThreadId();
+  auto gid_p = GetCurSystemThreadIdFromPthread();
+  VLOG(1) << "SetCurThreadId: " << gid << " -> " << id
+          << " pthread_id:" << gid_p;
+  system_thread_id_map[gid] = id;
+}
+
+int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
+  auto it = system_thread_id_map.find(id);
+  if (it != system_thread_id_map.end()) return it->second;
+  // return origin id if no event is recorded in this thread.
+  return static_cast<int32_t>(id);
+}
+
 }  // namespace platform
 }  // namespace paddle