Skip to content

Commit 5a8d276

Browse files
committed
[rocprofiler-sdk] On-demand GPU profile queue creation/destruction
Create and destroy GPU hardware profile queues per start_context/stop_context cycle instead of permanently at SDK initialization. This eliminates ~70ms TTFT overhead caused by 8 persistent GPU queues in the MES runlist.

Changes:
- agent_cache.hpp: Add destroy_device_counting_service_queue() declaration.
- agent_cache.cpp: Defer queue creation from the constructor to start_context; implement destroy_device_counting_service_queue() with hsa_queue_destroy.
- device_counting.cpp: Call init_device_counting_service_queue() in start_agent_ctx before the queue check; destroy signals, reset the packet, and destroy the queue in stop_agent_ctx after completion.

Test results (10 rounds, Qwen3-VL-235B on MI300X x8 with old in-process RDC):
- Baseline (no RDC): Mean TTFT 889.67ms
- Old RDC (permanent queues): Mean TTFT 972.63ms (+83ms, +9.3%)
- On-demand queues: Mean TTFT 854.60ms (-35ms, -3.9% vs baseline)
1 parent 2ce7992 commit 5a8d276

File tree

3 files changed

+34
-1
lines changed

3 files changed

+34
-1
lines changed

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/device_counting.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,9 @@ start_agent_ctx(const context::context* ctx)
432432
break;
433433
}
434434

435+
// On-demand: create the profile queue now (destroyed in stop_agent_ctx)
436+
agent->init_device_counting_service_queue(*hsa::get_core_table(), *hsa::get_amd_ext_table());
437+
435438
// But if we have an agent cache, we need a profile queue.
436439
if(!agent->profile_queue())
437440
{
@@ -603,6 +606,19 @@ stop_agent_ctx(const context::context* ctx)
603606
{
604607
counters::counter_collection_device_unlock(agent->get_rocp_agent());
605608
}
609+
610+
// On-demand cleanup: destroy signals, reset packet, destroy queue
611+
if(callback_data.completion.handle != 0) {
612+
hsa::get_core_table()->hsa_signal_destroy_fn(callback_data.completion);
613+
callback_data.completion.handle = 0;
614+
}
615+
if(callback_data.start_signal.handle != 0) {
616+
hsa::get_core_table()->hsa_signal_destroy_fn(callback_data.start_signal);
617+
callback_data.start_signal.handle = 0;
618+
}
619+
callback_data.packet.reset();
620+
callback_data.queue = nullptr;
621+
agent->destroy_device_counting_service_queue();
606622
}
607623

608624
agent_ctx.status.exchange(rocprofiler::context::device_counting_service::state::DISABLED);

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/agent_cache.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <stdexcept>
2828

2929
#include "lib/rocprofiler-sdk/context/context.hpp"
30+
#include "lib/rocprofiler-sdk/hsa/hsa.hpp"
3031

3132
namespace
3233
{
@@ -171,7 +172,7 @@ AgentCache::AgentCache(const rocprofiler_agent_t* rocp_agent,
171172
{
172173
init_cpu_pool(ext_table, *this);
173174
init_gpu_pool(ext_table, *this);
174-
init_device_counting_service_queue(api, ext_table);
175+
// init_device_counting_service_queue(api, ext_table); // Deferred to start_context for on-demand queue
175176
} catch(std::runtime_error& e)
176177
{
177178
ROCP_WARNING << fmt::format(
@@ -181,5 +182,20 @@ AgentCache::AgentCache(const rocprofiler_agent_t* rocp_agent,
181182
}
182183
}
183184

185+
186+
// Destroys this agent's profile queue, if one is set, and clears m_profile_queue.
// Called from stop_agent_ctx as the last step of the on-demand cleanup.
// NOTE(review): the method is const yet assigns m_profile_queue, so that member is
// presumably declared mutable — confirm in agent_cache.hpp.
void
187+
AgentCache::destroy_device_counting_service_queue() const
188+
{
189+
// Function-local static: a single mutex shared by every AgentCache instance that
// calls this function.
// NOTE(review): because it is local to this function, it cannot serialize against
// any other function that touches m_profile_queue (e.g. queue creation) — verify
// that no concurrent init/destroy is possible.
static std::mutex m_mutex;
190+
std::lock_guard<std::mutex> lock(m_mutex);
191+
192+
// Nothing to destroy when no queue is set.
if(!m_profile_queue) return;
193+
194+
auto* api = hsa::get_core_table();
195+
// Only call into HSA when the core table and its hsa_queue_destroy_fn entry exist.
if(api && api->hsa_queue_destroy_fn)
196+
// NOTE(review): whatever hsa_queue_destroy_fn returns is not checked here.
api->hsa_queue_destroy_fn(m_profile_queue);
197+
// Not guarded by the if above: the pointer is cleared even when the destroy call
// was skipped, in which case the queue itself is not destroyed by this function.
m_profile_queue = nullptr;
198+
}
199+
184200
} // namespace hsa
185201
} // namespace rocprofiler

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/agent_cache.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ class AgentCache
7474
size_t index() const { return m_index; }
7575

7676
void init_device_counting_service_queue(const CoreApiTable& api, const AmdExtTable& ext) const;
77+
void destroy_device_counting_service_queue() const;
7778
bool operator==(const rocprofiler_agent_t*) const;
7879
bool operator==(hsa_agent_t) const;
7980

0 commit comments

Comments
 (0)