Skip to content

Commit 2ce7992

Browse files
authored
[rocprofiler-systems]: Add OpenSHMEM API tracing (#3189)
Add optional OpenSHMEM API tracing using the GOTCHA-based backend. Tracing is off by default and must be enabled via config.
1 parent 30aeeee commit 2ce7992

24 files changed

Lines changed: 2008 additions & 14 deletions

File tree

projects/rocprofiler-systems/CHANGELOG.md

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,27 @@
44

55
Full documentation for ROCm Systems Profiler is available at [https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/).
66

7-
## ROCm Systems Profiler 1.5.0 for ROCm x.y.z (unreleased)
8-
9-
### Changed
10-
11-
- Simplify categorizing like pmc_info events by removing the _<idx> from the "symbol" field. ie., "JpegAct_0" -> "JpegAct".
12-
- Added `libhsa-runtime64.so` and `libomp.so` to the internal library exclusion list for runtime instrumentation to prevent instrumenting of runtime library internals.
13-
- RCCL implementation refactored with `production_pmc_registrar` for improved testability and separation of concerns.
14-
- Unsupported RCCL datatypes now gracefully return 0 with `LOG_WARNING` instead of aborting profiler, allowing continued profiling with newer RCCL versions.
7+
## ROCm Systems Profiler 1.5.0 for ROCm 7.12.0
158

169
### Added
1710

1811
- Per-GPU RCCL communication data counters (Send/Recv) in `rocpd` output with multi-GPU device attribution using `ncclCommCuDevice` API.
1912
- Preset profiles that configure the rocprofiler-systems tools for common profiling scenarios, offering optimized configurations for specific use cases.
2013
- `rocprof-sys-attach` CLI tool for attaching to and profiling running processes via rocprofiler-sdk rocattach API (experimental).
14+
- Support for OpenSHMEM API tracing via `ROCPROFSYS_USE_SHMEM=ON` configuration setting.
2115

22-
### Resolved issues
16+
### Changed
2317

24-
- Fixed an issue where JPEG engine activity PMC events were not being collected for MI35X systems. Only the first 32 JPEG engines were being collected.
18+
- Simplified categorization of similar pmc_info events by removing the _<idx> suffix from the "symbol" field, e.g., "JpegAct_0" -> "JpegAct".
19+
- Added `libhsa-runtime64.so` and `libomp.so` to the internal library exclusion list for runtime instrumentation to prevent instrumenting of runtime library internals.
20+
- RCCL implementation refactored with `production_pmc_registrar` for improved testability and separation of concerns.
21+
- Unsupported RCCL datatypes now gracefully return 0 with `LOG_WARNING` instead of aborting profiler, allowing continued profiling with newer RCCL versions.
2522

2623
### Resolved issues
2724

28-
- Fixed MPI perfetto trace file merging when using trace cache mode with `ROCPROFSYS_PERFETTO_COMBINE_TRACES=ON`. Previously, each MPI rank would produce a separate trace file; now all ranks' traces are correctly merged into a single output file.
25+
- Fixed an issue where JPEG engine activity PMC events were not being collected for MI35X systems. Only the first 32 JPEG engines were being collected.
26+
- Fixed MPI perfetto trace file merging when using trace cache mode with `ROCPROFSYS_PERFETTO_COMBINE_TRACES=ON`.
27+
Previously, each MPI rank would produce a separate trace file; now all ranks' traces are correctly merged into a single output file.
2928

3029
## ROCm Systems Profiler 1.4.0 for ROCm 7.11.0
3130

projects/rocprofiler-systems/examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,5 @@ add_subdirectory(jpegdecode)
9090
add_subdirectory(roctx)
9191
add_subdirectory(thread-limit)
9292
add_subdirectory(transferBench)
93+
add_subdirectory(shmem)
9394
add_subdirectory(scratch-memory)
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright (c) Advanced Micro Devices, Inc.
2+
# SPDX-License-Identifier: MIT
3+
4+
# CMake 3.21 or newer is required to configure this directory.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)

# Honor ROCPROFSYS_DISABLE_EXAMPLES: stop processing this file when the list
# names either the "rocprofiler-systems-shmem-examples" key or this directory.
if(ROCPROFSYS_DISABLE_EXAMPLES)
    if("rocprofiler-systems-shmem-examples" IN_LIST ROCPROFSYS_DISABLE_EXAMPLES)
        return()
    endif()

    # _DIR is the last path component of the directory containing this file.
    get_filename_component(_DIR ${CMAKE_CURRENT_LIST_DIR} NAME)
    if(${_DIR} IN_LIST ROCPROFSYS_DISABLE_EXAMPLES)
        return()
    endif()
endif()
15+
16+
# Locate oshcc, the OpenSHMEM C compiler wrapper (shipped e.g. with Open MPI),
# which supplies the include paths and libraries needed for shmem.h and the
# OpenSHMEM runtime. /usr/bin is searched first, on its own, so the system
# package is preferred (avoids a version mismatch with oshrun); the default
# search locations are only consulted when that first lookup fails.
find_program(OSHCC_EXECUTABLE NAMES oshcc PATHS /usr/bin NO_DEFAULT_PATH)
if(NOT OSHCC_EXECUTABLE)
    find_program(OSHCC_EXECUTABLE NAMES oshcc)
endif()

if(OSHCC_EXECUTABLE)
    message(STATUS "oshcc found: ${OSHCC_EXECUTABLE}")
else()
    # The examples cannot be built without oshcc, so report it and skip the
    # rest of this file. The report is an AUTHOR_WARNING unless the top-level
    # project is rocprofiler-systems and the ROCPROFSYS_CI environment variable
    # holds a true value, in which case it is demoted to STATUS.
    set(_MSG_TYPE AUTHOR_WARNING)
    if("${CMAKE_PROJECT_NAME}" STREQUAL "rocprofiler-systems" AND "$ENV{ROCPROFSYS_CI}")
        set(_MSG_TYPE STATUS) # don't generate warnings during CI
    endif()
    message(
        ${_MSG_TYPE}
        "oshcc not found. Skipping SHMEM examples (shmem_hello, shmem_pingpong)."
    )
    return()
endif()
37+
38+
# The examples are compiled by invoking oshcc directly (rather than through
# add_executable) so that shmem.h and the OpenSHMEM libraries are found.
# Binaries are written to the top-level build directory (CMAKE_BINARY_DIR).
set(_SHMEM_BINDIR ${CMAKE_BINARY_DIR})
set(_SHMEM_SRCDIR ${CMAKE_CURRENT_SOURCE_DIR})
set(_SHMEM_HELLO_EXE ${_SHMEM_BINDIR}/shmem_hello${CMAKE_EXECUTABLE_SUFFIX})
set(_SHMEM_PINGPONG_EXE ${_SHMEM_BINDIR}/shmem_pingpong${CMAKE_EXECUTABLE_SUFFIX})

# Parallel lists: the n-th name is built from <name>.c into the n-th path.
set(_SHMEM_EXAMPLE_NAMES shmem_hello shmem_pingpong)
set(_SHMEM_EXAMPLE_EXES ${_SHMEM_HELLO_EXE} ${_SHMEM_PINGPONG_EXE})

foreach(_SHMEM_NAME _SHMEM_EXE IN ZIP_LISTS _SHMEM_EXAMPLE_NAMES _SHMEM_EXAMPLE_EXES)
    # Build rule: regenerate the binary whenever its single source file changes.
    add_custom_command(
        OUTPUT ${_SHMEM_EXE}
        COMMAND ${OSHCC_EXECUTABLE} ${_SHMEM_SRCDIR}/${_SHMEM_NAME}.c -o ${_SHMEM_EXE}
        DEPENDS ${_SHMEM_SRCDIR}/${_SHMEM_NAME}.c
        COMMENT "Building ${_SHMEM_NAME} with oshcc"
        VERBATIM
    )
    # Named target, part of the default (ALL) build, that drives the rule above.
    add_custom_target(${_SHMEM_NAME} ALL DEPENDS ${_SHMEM_EXE})
endforeach()
62+
63+
# Publish the example executable paths as internal cache entries for
# rocprof-sys-shmem-tests.cmake (same project).
set(ROCPROFSYS_SHMEM_HELLO_EXE "${_SHMEM_HELLO_EXE}"
    CACHE INTERNAL "Path to shmem_hello executable"
)
set(ROCPROFSYS_SHMEM_PINGPONG_EXE "${_SHMEM_PINGPONG_EXE}"
    CACHE INTERNAL "Path to shmem_pingpong executable"
)
74+
75+
# When ROCPROFSYS_INSTALL_EXAMPLES is enabled, install both example binaries as
# part of the rocprofiler-systems-examples component. OPTIONAL makes a missing
# binary a non-error at install time.
if(ROCPROFSYS_INSTALL_EXAMPLES)
    install(
        PROGRAMS
            ${_SHMEM_HELLO_EXE}
            ${_SHMEM_PINGPONG_EXE}
        COMPONENT rocprofiler-systems-examples
        DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
        OPTIONAL
    )
endif()
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// Copyright (c) Advanced Micro Devices, Inc.
2+
// SPDX-License-Identifier: MIT
3+
4+
#include <shmem.h>
5+
#include <stdio.h>
6+
7+
// Minimal OpenSHMEM example: every PE prints a greeting, publishes its PE
// number in a symmetric integer, and then reads the integer owned by the next
// PE (wrapping around at the last PE).
//
// Returns 0 on success. If the symmetric allocation fails, an error is printed
// to stderr and all PEs are terminated with exit status 1.
int
main(void)
{
    // 1. Initialize the SHMEM environment
    shmem_init();

    // 2. Get basic information about my place in the world
    int me   = shmem_my_pe();  // my Processing Element (like rank)
    int npes = shmem_n_pes();  // total number of PEs (like size)

    // Simple output from every process
    printf("Hello from PE %d of %d\n", me, npes);

    // -------------------------------------------------------------------------
    // A bit more interesting: simple point-to-point communication
    // -------------------------------------------------------------------------

    // Allocate one integer in the symmetric heap (visible to all PEs)
    int* value = (int*) shmem_malloc(sizeof(int));
    if(value == NULL)
    {
        // Dereferencing a failed allocation would crash this PE and could leave
        // the others blocked in the barrier below, so stop the whole job.
        fprintf(stderr, "PE %d: shmem_malloc(%zu) failed\n", me, sizeof(int));
        shmem_global_exit(1);
        return 1;  // not reached; shmem_global_exit does not return
    }

    // Everyone initializes their own slot to their PE number
    *value = me;

    // Barrier so everyone has written their value
    shmem_barrier_all();

    // Each PE reads the value from the next PE (with wrap-around)
    int next_pe = (me + 1) % npes;
    int received;

    // Blocking get from next PE
    shmem_int_get(&received, value, 1, next_pe);

    printf("PE %d received value %d from PE %d\n", me, received, next_pe);

    // Optional: the get above is blocking, so this is not needed for
    // correctness here; it is kept so the example also calls shmem_quiet().
    shmem_quiet();

    // -------------------------------------------------------------------------
    // Cleanup
    // -------------------------------------------------------------------------
    shmem_free(value);

    // Finalize SHMEM (required in modern versions)
    shmem_finalize();

    return 0;
}

0 commit comments

Comments
 (0)