Skip to content

Commit 63e6d92

Browse files
cj401-amdzahiqbal
authored andcommitted
rocprof-sdk addition,
upstream PR: openxla/pull/29769 Squash following commits.. Update rocprofiler-sdk (v3) along with roctracer (v1) for rocm-jaxlib-v0.6.0 (#302) * update for integration of rocprofiler-sdk (along with roctracer as a backup based on bazel_options from CLI) (cherry picked from commit 7775dd0) use VLOG(2) to replace LOG(INFO), so PGLE has no verbose info (#357) (cherry picked from commit 5950125) update with kernel details for rocm-7.x (#364) * update with kernel details for rocm-7.x (cherry picked from commit 5597c0d) update to remove previously hard-coded rocprofiler-sdk path (#369) * update to remove previously hard-coded rocprofiler-sdk path and add skip_rocprofiler_sdk to avoid loading `rocprofiler-sdk` (cherry picked from commit ff74b5f)
1 parent 6b03bc3 commit 63e6d92

File tree

21 files changed

+2562
-1086
lines changed

21 files changed

+2562
-1086
lines changed

build_tools/lint/tags.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@
9898
"multi_gpu_h100": (
9999
"Used by `xla_test` to signal that multiple H100s are needed."
100100
),
101+
"skip_rocprofiler_sdk": "used to skip rocmtracer test as it calls rocprofiler-sdk via rocprofiler_force_configure",
101102
}
102103

103104

build_tools/rocm/run_xla.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,4 +147,4 @@ bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
147147
# clean up bazel disk_cache
148148
bazel shutdown \
149149
--disk_cache=${BAZEL_DISK_CACHE_DIR} \
150-
--experimental_disk_cache_gc_max_size=${BAZEL_DISK_CACHE_SIZE}
150+
--experimental_disk_cache_gc_max_size=${BAZEL_DISK_CACHE_SIZE}

third_party/gpus/rocm/BUILD.tpl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,19 @@ cc_library(
349349
deps = [":rocm_config"],
350350
)
351351

352+
cc_library(
353+
name = "rocprofiler-sdk",
354+
srcs = glob(["%{rocm_root}/lib/librocprofiler-sdk*.so*"]),
355+
hdrs = glob(["%{rocm_root}/include/rocprofiler-sdk/**"]),
356+
include_prefix = "rocm",
357+
includes = [
358+
"%{rocm_root}/include/",
359+
],
360+
strip_include_prefix = "%{rocm_root}",
361+
visibility = ["//visibility:public"],
362+
deps = [":rocm_config"],
363+
)
364+
352365
cc_library(
353366
name = "rocsolver",
354367
srcs = glob(["%{rocm_root}/lib/librocsolver*.so*"]),

third_party/gpus/rocm_configure.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ def _find_libs(repository_ctx, rocm_config, miopen_path, rccl_path, bash_bin):
333333
("rocsolver", rocm_config.rocm_toolkit_path),
334334
("hipfft", rocm_config.rocm_toolkit_path),
335335
("rocrand", rocm_config.rocm_toolkit_path),
336+
("rocprofiler-sdk", rocm_config.rocm_toolkit_path),
336337
]
337338
]
338339
if int(rocm_config.rocm_version_number) >= 40500:

xla/backends/gpu/runtime/buffer_comparator_test.cc

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ limitations under the License.
3838
#include "tsl/platform/ml_dtypes.h"
3939
#include "tsl/platform/test.h"
4040

41+
#include <hip/hip_runtime_api.h>
42+
4143
namespace xla {
4244
namespace gpu {
4345
namespace {
@@ -403,6 +405,84 @@ TEST_F(BufferComparatorTest, BF16) {
403405
.value());
404406
}
405407

408+
// ROCm-only: CI-safe very-large compare using *device* memory and U8 type.
409+
// - No HMM / mapped host memory
410+
// - 64-bit indexing stress (N > 2^31)
411+
// - Alignment-friendly aliasing offset
412+
TEST_F(BufferComparatorTest, VeryLargeArray_Device_U8_Aligned) {
413+
// Force 64-bit indexing without exceeding ~4 GiB VRAM.
414+
constexpr PrimitiveType number_type = U8;
415+
using NT = primitive_util::PrimitiveTypeToNative<number_type>::type;
416+
static_assert(sizeof(NT) == 1, "This test expects 8-bit elements.");
417+
418+
// N just above 2^31 so 32-bit indexing overflows.
419+
const uint64_t element_count = (1ull << 31) + 4096; // 2,147,488,744
420+
// Shift rhs by a cacheline (64 B) to keep accesses aligned even if
421+
// vectorized.
422+
constexpr size_t kShiftBytes = 64;
423+
424+
// Total device allocation: N + shift (≈ 2.15 GiB)
425+
const size_t bytes_total = static_cast<size_t>(element_count) + kShiftBytes;
426+
427+
// Create stream.
428+
auto stream_or = stream_exec_->CreateStream();
429+
ASSERT_TRUE(stream_or.ok()) << stream_or.status();
430+
std::unique_ptr<se::Stream> stream = std::move(stream_or.value());
431+
432+
// Single device allocation of (N + shift) bytes.
433+
void* dev_ptr = nullptr;
434+
hipError_t herr = hipMalloc(&dev_ptr, bytes_total);
435+
if (herr == hipErrorMemoryAllocation) {
436+
GTEST_SKIP() << "Insufficient VRAM for ~"
437+
<< (bytes_total / (1024.0 * 1024 * 1024))
438+
<< " GiB device alloc.";
439+
}
440+
ASSERT_EQ(herr, hipSuccess) << "hipMalloc failed: " << static_cast<int>(herr);
441+
442+
// Views:
443+
// base: whole allocation
444+
// lhs : [0 .. N-1]
445+
// rhs : [shift .. shift + N-1]
446+
se::DeviceMemoryBase base(dev_ptr, bytes_total);
447+
se::DeviceMemoryBase lhs(dev_ptr,
448+
static_cast<size_t>(element_count)); // bytes for U8
449+
se::DeviceMemoryBase rhs(static_cast<char*>(dev_ptr) + kShiftBytes,
450+
lhs.size());
451+
452+
// Initialize with zeros (MemZero works for any size/alignment).
453+
ASSERT_TRUE(stream->MemZero(&base, bytes_total).ok());
454+
ASSERT_TRUE(stream->BlockHostUntilDone().ok());
455+
456+
// Comparator: force device path (exercise kernel indexing).
457+
BufferComparator comparator(
458+
ShapeUtil::MakeShape(number_type, {static_cast<int64_t>(element_count)}),
459+
/*tolerance=*/0.0, /*verbose=*/false, /*run_host_compare=*/false);
460+
461+
// Pass 1: both views read zeros -> equal.
462+
{
463+
auto eq_or = comparator.CompareEqual(stream.get(), lhs, rhs);
464+
ASSERT_TRUE(eq_or.ok()) << eq_or.status();
465+
EXPECT_TRUE(eq_or.value());
466+
}
467+
468+
// Flip the very last element of rhs via stream-sequenced memcpy
469+
// (host->device).
470+
const NT new_val = static_cast<NT>(0xA5);
471+
se::DeviceMemoryBase rhs_last(
472+
static_cast<char*>(rhs.opaque()) + (element_count - 1), sizeof(NT));
473+
ASSERT_TRUE(stream->Memcpy(&rhs_last, &new_val, sizeof(NT)).ok());
474+
ASSERT_TRUE(stream->BlockHostUntilDone().ok());
475+
476+
// Pass 2: must detect tail mismatch.
477+
{
478+
auto eq_or = comparator.CompareEqual(stream.get(), lhs, rhs);
479+
ASSERT_TRUE(eq_or.ok()) << eq_or.status();
480+
EXPECT_FALSE(eq_or.value());
481+
}
482+
483+
ASSERT_EQ(hipFree(dev_ptr), hipSuccess);
484+
}
485+
406486
} // namespace
407487
} // namespace gpu
408488
} // namespace xla

xla/backends/profiler/gpu/BUILD

Lines changed: 149 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -359,11 +359,58 @@ cc_library(
359359
],
360360
)
361361

362+
config_setting(
363+
name = "use_v1",
364+
values = {"define": "xla_rocm_profiler=v1"},
365+
)
366+
367+
config_setting(
368+
name = "use_rocprofiler_sdk",
369+
values = {"define": "xla_rocm_profiler=v3"},
370+
)
371+
372+
cc_library(
373+
name = "rocm_profiler_backend_cfg",
374+
defines = select({
375+
":use_v1": ["XLA_GPU_ROCM_TRACER_BACKEND=1"],
376+
":use_rocprofiler_sdk": ["XLA_GPU_ROCM_TRACER_BACKEND=3"],
377+
"//conditions:default": ["XLA_GPU_ROCM_TRACER_BACKEND=3"],
378+
}),
379+
visibility = ["//visibility:public"],
380+
)
381+
382+
cc_library(
383+
name = "rocm_tracer_utils",
384+
srcs = ["rocm_tracer_utils.cc"],
385+
hdrs = ["rocm_tracer_utils.h"],
386+
tags = [
387+
"gpu",
388+
"manual",
389+
"rocm-only",
390+
],
391+
deps = [
392+
"//xla/tsl/profiler/backends/cpu:annotation_stack",
393+
"//xla/tsl/profiler/utils:time_utils",
394+
"//xla/tsl/profiler/utils:math_utils",
395+
"@com_google_absl//absl/strings:string_view",
396+
"@com_google_absl//absl/container:flat_hash_map",
397+
"@com_google_absl//absl/container:flat_hash_set",
398+
"@com_google_absl//absl/container:node_hash_map",
399+
"@com_google_absl//absl/container:node_hash_set",
400+
"@tsl//tsl/platform:env_time",
401+
"@tsl//tsl/platform:env",
402+
"@tsl//tsl/platform:errors",
403+
"@tsl//tsl/platform:logging",
404+
"@tsl//tsl/platform:macros",
405+
"@local_config_rocm//rocm:rocprofiler-sdk",
406+
],
407+
visibility = ["//visibility:public"],
408+
)
409+
362410
cc_library(
363411
name = "rocm_collector",
364412
srcs = ["rocm_collector.cc"],
365413
hdrs = ["rocm_collector.h"],
366-
# copybara:uncomment compatible_with = ["//buildenv/target:non_prod"],
367414
tags = [
368415
"gpu",
369416
"rocm-only",
@@ -372,6 +419,8 @@ cc_library(
372419
"manual",
373420
]),
374421
deps = [
422+
":rocm_tracer_utils",
423+
":rocm_profiler_backend_cfg",
375424
"//xla/stream_executor/rocm:roctracer_wrapper",
376425
"//xla/tsl/profiler/backends/cpu:annotation_stack",
377426
"//xla/tsl/profiler/utils:parse_annotation",
@@ -396,26 +445,53 @@ cc_library(
396445
"@tsl//tsl/platform:types",
397446
"@tsl//tsl/profiler/lib:profiler_factory",
398447
"@tsl//tsl/profiler/lib:profiler_interface",
448+
"@local_config_rocm//rocm:rocprofiler-sdk",
399449
],
400450
)
401451

402452
cc_library(
403-
name = "rocm_tracer",
404-
srcs = ["rocm_tracer.cc"],
405-
hdrs = ["rocm_tracer.h"],
406-
# copybara:uncomment compatible_with = ["//buildenv/target:non_prod"],
453+
name = "rocm_tracer_headers",
454+
hdrs = [
455+
"rocm_tracer.h",
456+
"rocm_profiler_sdk.h",
457+
"rocm_tracer_v1.h",
458+
],
407459
tags = [
408460
"gpu",
461+
"manual",
409462
"rocm-only",
410-
] + if_google([
411-
# TODO(b/360374983): Remove this tag once the target can be built without --config=rocm.
463+
],
464+
# PROPAGATE the layout macro to every dependent TU:
465+
defines = select({
466+
":use_v1": ["XLA_GPU_ROCM_TRACER_BACKEND=1"],
467+
":use_rocprofiler_sdk": ["XLA_GPU_ROCM_TRACER_BACKEND=3"],
468+
"//conditions:default": ["XLA_GPU_ROCM_TRACER_BACKEND=3"],
469+
}),
470+
visibility = ["//visibility:public"],
471+
)
472+
473+
cc_library(
474+
name = "rocm_tracer_impl",
475+
srcs = select({
476+
":use_v1": ["rocm_tracer_v1.cc"],
477+
":use_rocprofiler_sdk": ["rocm_profiler_sdk.cc"],
478+
"//conditions:default": ["rocm_profiler_sdk.cc"],
479+
}),
480+
tags = [
481+
"gpu",
412482
"manual",
413-
]),
483+
"rocm-only",
484+
],
414485
deps = [
486+
":rocm_tracer_headers",
415487
":rocm_collector",
416488
"//xla/stream_executor/rocm:roctracer_wrapper",
417489
"//xla/tsl/profiler/backends/cpu:annotation_stack",
418490
"//xla/tsl/profiler/utils:time_utils",
491+
"//xla/tsl/profiler/utils:xplane_builder",
492+
"//xla/tsl/profiler/utils:xplane_schema",
493+
"//xla/tsl/profiler/utils:xplane_utils",
494+
"//xla/tsl/util:env_var",
419495
"@com_google_absl//absl/container:fixed_array",
420496
"@com_google_absl//absl/container:flat_hash_map",
421497
"@com_google_absl//absl/container:flat_hash_set",
@@ -432,9 +508,74 @@ cc_library(
432508
"@tsl//tsl/platform:status",
433509
"@tsl//tsl/platform:thread_annotations",
434510
"@tsl//tsl/platform:types",
511+
"@tsl//tsl/profiler/lib:profiler_factory",
512+
"@tsl//tsl/profiler/lib:profiler_interface",
513+
],
514+
)
515+
516+
cc_library(
517+
name = "rocm_tracer",
518+
tags = [
519+
"gpu",
520+
"manual",
521+
"rocm-only",
522+
],
523+
deps = [":rocm_tracer_headers", ":rocm_tracer_impl"],
524+
visibility = ["//visibility:public"],
525+
)
526+
527+
# upstream it's called xla_cc_test as no GPU involved.
528+
xla_test(
529+
name = "rocm_tracer_test",
530+
size = "small",
531+
srcs = ["rocm_tracer_test.cc"],
532+
tags = [
533+
"gpu",
534+
"rocm-only",
535+
"skip_rocprofiler_sdk", # due to rocprofiler-sdk's rocprofiler_force_configure
536+
] + if_google([
537+
# Optional: only run internally if ROCm config is enabled
538+
"manual",
539+
]),
540+
deps = [
541+
":rocm_tracer",
542+
":rocm_tracer_utils",
543+
"//xla/tsl/profiler/utils:xplane_builder",
544+
"@com_google_absl//absl/container:flat_hash_map",
545+
"@com_google_googletest//:gtest_main",
546+
"@tsl//tsl/platform:status_matchers",
547+
"@tsl//tsl/platform:test",
548+
"@tsl//tsl/profiler/protobuf:xplane_proto_cc",
435549
],
436550
)
437551

552+
xla_test(
553+
name = "rocm_collector_test",
554+
size = "small",
555+
srcs = ["rocm_collector_test.cc"],
556+
tags = [
557+
"gpu",
558+
"rocm-only",
559+
] + if_google([
560+
"manual",
561+
]),
562+
deps = [
563+
":rocm_collector",
564+
":rocm_tracer_utils",
565+
"//xla/tsl/profiler/utils:xplane_builder",
566+
"@com_google_absl//absl/container:flat_hash_map",
567+
"@com_google_googletest//:gtest_main",
568+
"@tsl//tsl/platform:env_time",
569+
"@tsl//tsl/platform:status_matchers",
570+
"@tsl//tsl/platform:test",
571+
"@tsl//tsl/profiler/protobuf:xplane_proto_cc",
572+
"@tsl//tsl/platform:env",
573+
"@tsl//tsl/platform:errors",
574+
"@tsl//tsl/platform:logging",
575+
"@tsl//tsl/platform:macros",
576+
],
577+
)
578+
438579
cc_library(
439580
name = "nvtx_utils",
440581
srcs = ["nvtx_utils.cc"],

0 commit comments

Comments
 (0)