Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 10 additions & 20 deletions xla/backends/profiler/gpu/rocm_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -226,21 +226,11 @@ void PerDeviceCollector::CreateXEvent(const RocmTracerEvent& event,

if (event.type == RocmTracerEventType::Kernel &&
event.source == RocmTracerEventSource::Activity) {
RocmDeviceOccupancyParams params{};
params.attributes.maxThreadsPerBlock = INT_MAX;
params.attributes.numRegs =
static_cast<int>(event.kernel_info.registers_per_thread);
params.attributes.sharedSizeBytes =
event.kernel_info.static_shared_memory_usage;
// params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF;
// params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
params.attributes.maxDynamicSharedSizeBytes = 0;
params.block_size =
static_cast<int>(event.kernel_info.block_x * event.kernel_info.block_y *
event.kernel_info.block_z);

params.dynamic_smem_size = event.kernel_info.dynamic_shared_memory_usage;
params.func_ptr = event.kernel_info.func_ptr;
xevent.AddStatValue(
*plane->GetOrCreateStatMetadata(
GetStatTypeStr(StatType::kKernelDetails)),
*plane->GetOrCreateStatMetadata(ToXStat(event.kernel_info,
/*occupancy_pct*/ 0)));
} else if (event.type == RocmTracerEventType::MemcpyH2D ||
event.type == RocmTracerEventType::MemcpyD2H ||
event.type == RocmTracerEventType::MemcpyD2D ||
Expand Down Expand Up @@ -601,22 +591,22 @@ std::vector<RocmTracerEvent> RocmTraceCollectorImpl::ApiActivityInfoExchange() {
api_event.stream_id = item.stream_id;
switch (api_event.type) {
case RocmTracerEventType::Kernel:
api_event.kernel_info = item.kernel_info;
aggregated_events.push_back(api_event);
break;
case RocmTracerEventType::Memset:
case RocmTracerEventType::MemoryAlloc:
case RocmTracerEventType::MemoryFree:
case RocmTracerEventType::Synchronization: {
case RocmTracerEventType::Synchronization:
aggregated_events.push_back(api_event);
break;
}
case RocmTracerEventType::MemcpyD2H:
case RocmTracerEventType::MemcpyH2D:
case RocmTracerEventType::MemcpyD2D:
case RocmTracerEventType::MemcpyOther: {
// api_event.memcpy_info.destination = item.device_id;
case RocmTracerEventType::MemcpyOther:
api_event.memcpy_info = item.memcpy_info;
aggregated_events.push_back(api_event);
break;
}
default:
OnEventsDropped("Missing API-Activity information exchange. Dropped!",
api_event.correlation_id);
Expand Down
32 changes: 21 additions & 11 deletions xla/backends/profiler/gpu/rocm_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,24 @@ using tsl::profiler::XSpace;

inline std::string ToXStat(const KernelDetails& kernel_info,
double occupancy_pct) {
return absl::StrCat(
"regs:", kernel_info.registers_per_thread,
" static_shared:", kernel_info.static_shared_memory_usage,
" dynamic_shared:", kernel_info.dynamic_shared_memory_usage,
" grid:", kernel_info.grid_x, ",", kernel_info.grid_y, ",",
kernel_info.grid_z, " block:", kernel_info.block_x, ",",
kernel_info.block_y, ",", kernel_info.block_z,
" occ_pct:", occupancy_pct);
// on ROCM 6.4.x/7.0.0 rocprofiler sdk mixed up grid and workgroup size...
// only shows them with ROCm-7.x
uint32_t grid_x = kernel_info.workgroup_x != 0
? kernel_info.grid_x / kernel_info.workgroup_x
: 0,
grid_y = kernel_info.workgroup_y != 0
? kernel_info.grid_y / kernel_info.workgroup_y
: 0,
grid_z = kernel_info.workgroup_z != 0
? kernel_info.grid_z / kernel_info.workgroup_z
: 0;

return absl::StrCat(" grid:", grid_x, ",", grid_y, ",", grid_z,
" block:", kernel_info.workgroup_x, ",",
kernel_info.workgroup_y, ",", kernel_info.workgroup_z,
" private_mem:", kernel_info.private_segment_size,
" group_mem:", kernel_info.group_segment_size,
" occ_pct:", occupancy_pct);
}

struct RocmDeviceOccupancyParams {
Expand Down Expand Up @@ -178,8 +188,8 @@ class RocmTraceCollectorImpl : public RocmTraceCollector {

void OnEventsDropped(const std::string& reason,
uint32_t correlation_id) override {
LOG(INFO) << "RocmTracerEvent dropped (correlation_id=" << correlation_id
<< ",) : " << reason << ".";
VLOG(2) << "RocmTracerEvent dropped (correlation_id=" << correlation_id
<< ",) : " << reason << ".";
}

private:
Expand Down Expand Up @@ -217,4 +227,4 @@ std::unique_ptr<RocmTraceCollector> CreateRocmCollector(
} // namespace profiler
} // namespace xla

#endif // XLA_BACKENDS_PROFILER_GPU_ROCM_COLLECTOR_H_
#endif // XLA_BACKENDS_PROFILER_GPU_ROCM_COLLECTOR_H_
5 changes: 2 additions & 3 deletions xla/backends/profiler/gpu/rocm_collector_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,8 @@ TEST_F(RocmCollectorSanitizerTest, TestAddKernelEventAndExport_TSANSafe) {
api_event.device_id = 0; // set explicitly
api_event.stream_id = 123;
api_event.kernel_info = {
/*.registers_per_thread=*/32,
/*.static_shared_memory_usage=*/1024,
/*.dynamic_shared_memory_usage=*/0,
/*.private_segment_size=*/32,
/*.group_segment_size=*/1024,
/*.block_x=*/256,
/*.block_y=*/1,
/*.block_z=*/1,
Expand Down
11 changes: 5 additions & 6 deletions xla/backends/profiler/gpu/rocm_profiler_sdk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -287,12 +287,11 @@ void RocmTracer::KernelEvent(const rocprofiler_record_header_t* hdr,
trace_event->thread_id = rec.thread_id;
trace_event->stream_id = kinfo.queue_id.handle;
trace_event->kernel_info = KernelDetails{
.registers_per_thread = 0,
.static_shared_memory_usage = 0,
.dynamic_shared_memory_usage = 0,
.block_x = kinfo.workgroup_size.x,
.block_y = kinfo.workgroup_size.y,
.block_z = kinfo.workgroup_size.z,
.private_segment_size = kinfo.private_segment_size,
.group_segment_size = kinfo.group_segment_size,
.workgroup_x = kinfo.workgroup_size.x,
.workgroup_y = kinfo.workgroup_size.y,
.workgroup_z = kinfo.workgroup_size.z,
.grid_x = kinfo.grid_size.x,
.grid_y = kinfo.grid_size.y,
.grid_z = kinfo.grid_size.z,
Expand Down
29 changes: 14 additions & 15 deletions xla/backends/profiler/gpu/rocm_tracer_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef XLA_BACKENDS_PROFILER_GPU_ROCM_UTILS_H_
#define XLA_BACKENDS_PROFILER_GPU_ROCM_UTILS_H_
#ifndef XLA_BACKENDS_PROFILER_GPU_ROCM_TRACER_UTILS_H_
#define XLA_BACKENDS_PROFILER_GPU_ROCM_TRACER_UTILS_H_

#include <cstdint>
#include <cstring>
Expand Down Expand Up @@ -66,18 +66,17 @@ struct MemsetDetails {
};

struct KernelDetails {
// The number of registers used in this kernel.
uint32_t registers_per_thread;
// The amount of shared memory space used by a thread block.
uint32_t static_shared_memory_usage;
// The amount of dynamic memory space used by a thread block.
uint32_t dynamic_shared_memory_usage;
// X-dimension of a thread block.
uint32_t block_x;
// Y-dimension of a thread block.
uint32_t block_y;
// Z-dimension of a thread block.
uint32_t block_z;
// The amount of private memory used by kernel,
// number of register per thread (register spillage if > 0)
uint32_t private_segment_size;
// The amount of shared memory (SMEM)
uint32_t group_segment_size;
Comment thread
cj401-amd marked this conversation as resolved.
// X-dimension of a workgroup (grid.x*block.x)
uint32_t workgroup_x;
// Y-dimension of a workgroup (grid.x*block.x)
uint32_t workgroup_y;
// Z-dimension of a workgroup (grid.x*block.x)
uint32_t workgroup_z;
// X-dimension of a grid.
uint32_t grid_x;
// Y-dimension of a grid.
Expand Down Expand Up @@ -203,4 +202,4 @@ class AnnotationMap {
} // namespace profiler
} // namespace xla

#endif // XLA_BACKENDS_PROFILER_GPU_ROCM_UTILS_H_
#endif // XLA_BACKENDS_PROFILER_GPU_ROCM_TRACER_UTILS_H_
4 changes: 3 additions & 1 deletion xla/service/gpu/transforms/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ load(
"if_gpu_is_configured",
)
load("//xla/tests:build_defs.bzl", "xla_test")
load("//xla/tsl:tsl.bzl", "if_google", "if_oss")
load("//xla/tsl:tsl.bzl", "if_google",
"if_oss", # @unused
)
load(
"//xla/tsl/platform/default:cuda_build_defs.bzl",
"if_cuda_is_configured",
Expand Down
1 change: 0 additions & 1 deletion xla/tsl/profiler/utils/xplane_schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,6 @@ bool IsInternalEvent(std::optional<int64_t> event_type) {
bool IsInternalStat(std::optional<int64_t> stat_type) {
if (!stat_type.has_value()) return false;
switch (*stat_type) {
case StatType::kKernelDetails:
case StatType::kProducerType:
case StatType::kProducerId:
case StatType::kConsumerType:
Expand Down