Skip to content

Commit 4b7a300

Browse files
authored
Merge pull request #814 from CHIP-SPV/dyn-q-prof
OpenCL: Use non-profiling queue, switch to profiling when needed
2 parents c320990 + 2a91636 commit 4b7a300

4 files changed

Lines changed: 167 additions & 44 deletions

File tree

docs/Using.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,11 @@ When set to `0`, chipStar will compile all device modules at the runtime
4949
initialization. Default setting is `1` meaning the device modules are
5050
compiled just before kernel launches.
5151

52+
#### CHIP\_OCL\_DISABLE\_QUEUE\_PROFILING
53+
54+
A debug option for forcing queue profiling to be disabled in the
55+
OpenCL backend. The default setting is `0`.
56+
5257
### Disabling GPU hangcheck
5358

5459
Note that long-running GPU compute kernels can trigger hang detection mechanism in the GPU driver, which will cause the kernel execution to be terminated and the runtime will report an error. Consult the documentation of your GPU driver on how to disable this hangcheck.

src/CHIPDriver.hh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ private:
230230
std::string JitFlags_ = CHIP_DEFAULT_JIT_FLAGS;
231231
unsigned long L0EventTimeout_ = 0;
232232
int L0CollectEventsTimeout_ = 0;
233+
bool OCLDisableQueueProfiling_ = false;
233234

234235
public:
235236
EnvVars() {
@@ -252,6 +253,7 @@ public:
252253

253254
return L0EventTimeout_ * 1e9;
254255
}
256+
bool getOCLDisableQueueProfiling() const { return OCLDisableQueueProfiling_; }
255257

256258
private:
257259
void parseEnvironmentVariables() {
@@ -282,6 +284,12 @@ private:
282284

283285
if (!readEnvVar("CHIP_L0_EVENT_TIMEOUT").empty())
284286
L0EventTimeout_ = parseInt("CHIP_L0_EVENT_TIMEOUT");
287+
288+
constexpr char DisableQProfilingEnv[] = "CHIP_OCL_DISABLE_QUEUE_PROFILING";
289+
if (!readEnvVar(DisableQProfilingEnv).empty()) {
290+
OCLDisableQueueProfiling_ = parseBoolean(DisableQProfilingEnv);
291+
logDebug("{}={}", DisableQProfilingEnv, OCLDisableQueueProfiling_);
292+
}
285293
}
286294

287295
std::string_view parseJitFlags(const std::string &StrIn) {

src/backend/OpenCL/CHIPBackendOpenCL.cc

Lines changed: 126 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -654,8 +654,11 @@ void CHIPQueueOpenCL::recordEvent(chipstar::Event *ChipEvent) {
654654
logTrace("chipstar::Queue::recordEvent({})", (void *)ChipEvent);
655655
auto ChipEventCL = static_cast<CHIPEventOpenCL *>(ChipEvent);
656656

657-
std::shared_ptr<chipstar::Event> LastEvent = getLastEvent();
658-
ChipEventCL->recordEventCopy(LastEvent ? LastEvent : enqueueMarker());
657+
// Need profiling command queue for querying timestamps for possible
658+
// later hipEventElapsedTime() calls.
659+
switchModeTo(Profiling);
660+
661+
ChipEventCL->recordEventCopy(enqueueMarker());
659662
ChipEventCL->setRecording();
660663
}
661664

@@ -1084,24 +1087,25 @@ void CHIPQueueOpenCL::MemMap(const chipstar::AllocationInfo *AllocInfo,
10841087
auto [SyncQueuesEventHandles, EventLocks] =
10851088
addDependenciesQueueSync(MemMapEvent);
10861089

1090+
auto QueueHandle = get()->get();
10871091
cl_int Status;
10881092
if (Type == chipstar::Queue::MEM_MAP_TYPE::HOST_READ) {
10891093
logDebug("CHIPQueueOpenCL::MemMap HOST_READ");
1090-
Status = clEnqueueSVMMap(ClQueue_->get(), CL_TRUE, CL_MAP_READ,
1091-
AllocInfo->HostPtr, AllocInfo->Size,
1092-
SyncQueuesEventHandles.size(),
1093-
SyncQueuesEventHandles.data(), MemMapEventNative);
1094+
Status =
1095+
clEnqueueSVMMap(QueueHandle, CL_TRUE, CL_MAP_READ, AllocInfo->HostPtr,
1096+
AllocInfo->Size, SyncQueuesEventHandles.size(),
1097+
SyncQueuesEventHandles.data(), MemMapEventNative);
10941098
} else if (Type == chipstar::Queue::MEM_MAP_TYPE::HOST_WRITE) {
10951099
logDebug("CHIPQueueOpenCL::MemMap HOST_WRITE");
1096-
Status = clEnqueueSVMMap(ClQueue_->get(), CL_TRUE, CL_MAP_WRITE,
1097-
AllocInfo->HostPtr, AllocInfo->Size,
1098-
SyncQueuesEventHandles.size(),
1099-
SyncQueuesEventHandles.data(), MemMapEventNative);
1100+
Status =
1101+
clEnqueueSVMMap(QueueHandle, CL_TRUE, CL_MAP_WRITE, AllocInfo->HostPtr,
1102+
AllocInfo->Size, SyncQueuesEventHandles.size(),
1103+
SyncQueuesEventHandles.data(), MemMapEventNative);
11001104
} else if (Type == chipstar::Queue::MEM_MAP_TYPE::HOST_READ_WRITE) {
11011105
logDebug("CHIPQueueOpenCL::MemMap HOST_READ_WRITE");
1102-
Status = clEnqueueSVMMap(ClQueue_->get(), CL_TRUE,
1103-
CL_MAP_READ | CL_MAP_WRITE, AllocInfo->HostPtr,
1104-
AllocInfo->Size, SyncQueuesEventHandles.size(),
1106+
Status = clEnqueueSVMMap(QueueHandle, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
1107+
AllocInfo->HostPtr, AllocInfo->Size,
1108+
SyncQueuesEventHandles.size(),
11051109
SyncQueuesEventHandles.data(), MemMapEventNative);
11061110
} else {
11071111
assert(0 && "Invalid MemMap Type");
@@ -1123,13 +1127,23 @@ void CHIPQueueOpenCL::MemUnmap(const chipstar::AllocationInfo *AllocInfo) {
11231127
addDependenciesQueueSync(MemMapEvent);
11241128

11251129
auto Status = clEnqueueSVMUnmap(
1126-
ClQueue_->get(), AllocInfo->HostPtr, SyncQueuesEventHandles.size(),
1130+
get()->get(), AllocInfo->HostPtr, SyncQueuesEventHandles.size(),
11271131
SyncQueuesEventHandles.data(),
11281132
std::static_pointer_cast<CHIPEventOpenCL>(MemMapEvent)->getNativePtr());
11291133
assert(Status == CL_SUCCESS);
11301134
}
11311135

1132-
cl::CommandQueue *CHIPQueueOpenCL::get() { return ClQueue_; }
1136+
cl::CommandQueue *CHIPQueueOpenCL::get() {
1137+
switch (QueueMode_) {
1138+
default:
1139+
assert(!"Unknown queue mode!");
1140+
// Fallthrough.
1141+
case Profiling:
1142+
return &ClProfilingQueue_;
1143+
case Regular:
1144+
return &ClRegularQueue_;
1145+
}
1146+
}
11331147

11341148
void CHIPQueueOpenCL::addCallback(hipStreamCallback_t Callback,
11351149
void *UserData) {
@@ -1187,7 +1201,7 @@ void CHIPQueueOpenCL::addCallback(hipStreamCallback_t Callback,
11871201
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd);
11881202

11891203
updateLastEvent(CallbackCompleted);
1190-
ClQueue_->flush();
1204+
get()->flush();
11911205

11921206
// Now the CB can start executing in the background:
11931207
clSetUserEventStatus(
@@ -1251,7 +1265,7 @@ CHIPQueueOpenCL::launchImpl(chipstar::ExecItem *ExecItem) {
12511265
auto [SyncQueuesEventHandles, EventLocks] =
12521266
addDependenciesQueueSync(LaunchEvent);
12531267
auto Status = clEnqueueNDRangeKernel(
1254-
ClQueue_->get(), KernelHandle, NumDims, GlobalOffset, Global, Local,
1268+
get()->get(), KernelHandle, NumDims, GlobalOffset, Global, Local,
12551269
SyncQueuesEventHandles.size(), SyncQueuesEventHandles.data(),
12561270
std::static_pointer_cast<CHIPEventOpenCL>(LaunchEvent)->getNativePtr());
12571271
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd);
@@ -1289,7 +1303,7 @@ CHIPQueueOpenCL::launchImpl(chipstar::ExecItem *ExecItem) {
12891303
}
12901304

12911305
CHIPQueueOpenCL::CHIPQueueOpenCL(chipstar::Device *ChipDevice, int Priority,
1292-
cl_command_queue Queue)
1306+
cl_command_queue QueueForInterop)
12931307
: chipstar::Queue(ChipDevice, chipstar::QueueFlags{}, Priority) {
12941308

12951309
cl_queue_priority_khr PrioritySelection;
@@ -1312,31 +1326,57 @@ CHIPQueueOpenCL::CHIPQueueOpenCL(chipstar::Device *ChipDevice, int Priority,
13121326
if (PrioritySelection != CL_QUEUE_PRIORITY_MED_KHR)
13131327
logWarn("CHIPQueueOpenCL is ignoring Priority value");
13141328

1315-
if (Queue)
1316-
ClQueue_ = new cl::CommandQueue(Queue);
1317-
else {
1318-
cl::Context *ClContext_ = ((CHIPContextOpenCL *)ChipContext_)->get();
1319-
cl::Device *ClDevice_ = ((CHIPDeviceOpenCL *)ChipDevice_)->get();
1329+
if (QueueForInterop) {
1330+
auto QFromUser = cl::CommandQueue(QueueForInterop);
1331+
13201332
cl_int Status;
1321-
// Adding priority breaks correctness?
1322-
// cl_queue_properties QueueProperties[] = {
1323-
// CL_QUEUE_PRIORITY_KHR, PrioritySelection, CL_QUEUE_PROPERTIES,
1324-
// CL_QUEUE_PROFILING_ENABLE, 0};
1325-
cl_queue_properties QueueProperties[] = {CL_QUEUE_PROPERTIES,
1326-
CL_QUEUE_PROFILING_ENABLE, 0};
1333+
cl_command_queue_properties QProps =
1334+
QFromUser.getInfo<CL_QUEUE_PROPERTIES>(&Status);
1335+
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS,
1336+
hipErrorInitializationError);
13271337

1328-
const cl_command_queue Q = clCreateCommandQueueWithProperties(
1329-
ClContext_->get(), ClDevice_->get(), QueueProperties, &Status);
1330-
ClQueue_ = new cl::CommandQueue(Q);
1338+
bool ProfilingIsEnabled = CL_QUEUE_PROFILING_ENABLE & QProps;
1339+
if (ProfilingIsEnabled) {
1340+
ClProfilingQueue_ = QFromUser;
1341+
QueueMode_ = Profiling;
1342+
} else {
1343+
logWarn("Provided queue has profiling disable. hipEventElapsedTime() "
1344+
"calls will fail.");
1345+
ClRegularQueue_ = QFromUser;
1346+
QueueMode_ = Regular;
1347+
}
1348+
UsedInInterOp = true;
1349+
} else {
1350+
cl::Context &ClContext =
1351+
*static_cast<CHIPContextOpenCL *>(ChipContext_)->get();
1352+
cl::Device &ClDevice = *static_cast<CHIPDeviceOpenCL *>(ChipDevice_)->get();
13311353

1354+
cl_queue_properties QPropsForProfiling[] = {CL_QUEUE_PROPERTIES,
1355+
CL_QUEUE_PROFILING_ENABLE, 0};
1356+
1357+
cl_int Status;
1358+
ClRegularQueue_ = cl::CommandQueue(
1359+
clCreateCommandQueueWithProperties(ClContext.get(), ClDevice.get(),
1360+
nullptr, &Status),
1361+
false);
13321362
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS,
13331363
hipErrorInitializationError);
1364+
1365+
if (!ChipEnvVars.getOCLDisableQueueProfiling()) {
1366+
ClProfilingQueue_ = cl::CommandQueue(
1367+
clCreateCommandQueueWithProperties(ClContext.get(), ClDevice.get(),
1368+
QPropsForProfiling, &Status),
1369+
false);
1370+
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS,
1371+
hipErrorInitializationError);
1372+
}
1373+
1374+
QueueMode_ = Regular;
13341375
}
13351376
}
13361377

13371378
CHIPQueueOpenCL::~CHIPQueueOpenCL() {
13381379
logTrace("~CHIPQueueOpenCL() {}", (void *)this);
1339-
delete ClQueue_;
13401380
}
13411381

13421382
std::shared_ptr<chipstar::Event>
@@ -1356,7 +1396,7 @@ CHIPQueueOpenCL::memCopyAsyncImpl(void *Dst, const void *Src, size_t Size) {
13561396
// a maker here, so we can return an event.
13571397
cl::Event MarkerEvent;
13581398
auto Status = clEnqueueMarker(
1359-
ClQueue_->get(),
1399+
get()->get(),
13601400
std::static_pointer_cast<CHIPEventOpenCL>(Event)->getNativePtr());
13611401
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd);
13621402
} else {
@@ -1365,8 +1405,8 @@ CHIPQueueOpenCL::memCopyAsyncImpl(void *Dst, const void *Src, size_t Size) {
13651405
#endif
13661406
auto [SyncQueuesEventHandles, EventLocks] = addDependenciesQueueSync(Event);
13671407
auto Status = ::clEnqueueSVMMemcpy(
1368-
ClQueue_->get(), CL_FALSE, Dst, Src, Size,
1369-
SyncQueuesEventHandles.size(), SyncQueuesEventHandles.data(),
1408+
get()->get(), CL_FALSE, Dst, Src, Size, SyncQueuesEventHandles.size(),
1409+
SyncQueuesEventHandles.data(),
13701410
std::static_pointer_cast<CHIPEventOpenCL>(Event)->getNativePtr());
13711411
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorRuntimeMemory);
13721412
}
@@ -1378,7 +1418,7 @@ void CHIPQueueOpenCL::finish() {
13781418
#ifdef CHIP_DUBIOUS_LOCKS
13791419
LOCK(Backend->DubiousLockOpenCL)
13801420
#endif
1381-
auto Status = ClQueue_->finish();
1421+
auto Status = get()->finish();
13821422
CHIPERR_CHECK_LOG_AND_ABORT(Status, CL_SUCCESS, hipErrorTbd);
13831423
}
13841424

@@ -1391,7 +1431,7 @@ CHIPQueueOpenCL::memFillAsyncImpl(void *Dst, size_t Size, const void *Pattern,
13911431
logTrace("clSVMmemfill {} / {} B\n", Dst, Size);
13921432
auto [SyncQueuesEventHandles, EventLocks] = addDependenciesQueueSync(Event);
13931433
int Retval = ::clEnqueueSVMMemFill(
1394-
ClQueue_->get(), Dst, Pattern, PatternSize, Size,
1434+
get()->get(), Dst, Pattern, PatternSize, Size,
13951435
SyncQueuesEventHandles.size(), SyncQueuesEventHandles.data(),
13961436
std::static_pointer_cast<CHIPEventOpenCL>(Event)->getNativePtr());
13971437
CHIPERR_CHECK_LOG_AND_THROW(Retval, CL_SUCCESS, hipErrorRuntimeMemory);
@@ -1433,8 +1473,12 @@ hipError_t CHIPQueueOpenCL::getBackendHandles(uintptr_t *NativeInfo,
14331473
return hipSuccess;
14341474
}
14351475

1476+
// Interop API expects one queue handle. Switch to the profiling queue which
1477+
// works for the interop user and for us in case we need profiling later.
1478+
switchModeTo(Profiling);
1479+
14361480
// Get queue handler
1437-
NativeInfo[4] = (uintptr_t)ClQueue_->get();
1481+
NativeInfo[4] = (uintptr_t)get()->get();
14381482

14391483
// Get context handler
14401484
cl::Context *Ctx = ((CHIPContextOpenCL *)ChipContext_)->get();
@@ -1481,15 +1525,15 @@ std::shared_ptr<chipstar::Event> CHIPQueueOpenCL::enqueueBarrierImpl(
14811525
SyncQueuesEventHandles.push_back(Event);
14821526
}
14831527
auto Status = clEnqueueBarrierWithWaitList(
1484-
ClQueue_->get(), SyncQueuesEventHandles.size(),
1528+
get()->get(), SyncQueuesEventHandles.size(),
14851529
SyncQueuesEventHandles.data(),
14861530
&(std::static_pointer_cast<CHIPEventOpenCL>(Event)->getNativeRef()));
14871531
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd);
14881532
} else {
14891533
// auto Status = ClQueue_->enqueueBarrierWithWaitList(nullptr, &Barrier);
14901534
auto [SyncQueuesEventHandles, EventLocks] = addDependenciesQueueSync(Event);
14911535
auto Status = clEnqueueBarrierWithWaitList(
1492-
ClQueue_->get(), SyncQueuesEventHandles.size(),
1536+
get()->get(), SyncQueuesEventHandles.size(),
14931537
SyncQueuesEventHandles.data(),
14941538
std::static_pointer_cast<CHIPEventOpenCL>(Event)->getNativePtr());
14951539
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd);
@@ -1502,6 +1546,47 @@ std::shared_ptr<chipstar::Event> CHIPQueueOpenCL::enqueueBarrierImpl(
15021546
return Event;
15031547
}
15041548

1549+
void CHIPQueueOpenCL::switchModeTo(QueueMode ToMode) {
1550+
if (QueueMode_ == ToMode)
1551+
return;
1552+
1553+
if (ToMode == Profiling && ChipEnvVars.getOCLDisableQueueProfiling()) {
1554+
logWarn("Queue profiling is disabled by request. hipEventElapsedTime() "
1555+
"will not work.");
1556+
return;
1557+
}
1558+
1559+
// Can't switch mode if the queue is used for interop. Otherwise, we
1560+
// may cause situations where commands enqueued via HIP and other
1561+
// API end up be executed out of order.
1562+
if (UsedInInterOp)
1563+
CHIPERR_LOG_AND_THROW("Can't switch mode for a queue used of interop.",
1564+
hipErrorTbd);
1565+
1566+
logDebug("Queue {}: switching mode to {}", (void *)this,
1567+
ToMode == Profiling ? "profiling" : "regular");
1568+
1569+
// Make sure commands are executed in-order across the current and
1570+
// switched-to queue.
1571+
auto &FromQ = *get();
1572+
auto &ToQ = ToMode == Profiling ? ClProfilingQueue_ : ClRegularQueue_;
1573+
assert(FromQ.get());
1574+
assert(ToQ.get());
1575+
1576+
cl_event SwitchEv;
1577+
cl_int Status;
1578+
Status = clEnqueueMarkerWithWaitList(FromQ.get(), 0, nullptr, &SwitchEv);
1579+
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd);
1580+
1581+
Status = clEnqueueBarrierWithWaitList(ToQ.get(), 1, &SwitchEv, nullptr);
1582+
CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd);
1583+
1584+
auto *ChipEv = new CHIPEventOpenCL(
1585+
static_cast<CHIPContextOpenCL *>(ChipContext_), SwitchEv);
1586+
updateLastEvent(std::shared_ptr<chipstar::Event>(ChipEv));
1587+
QueueMode_ = ToMode;
1588+
}
1589+
15051590
// CHIPExecItemOpenCL
15061591
//*************************************************************************
15071592

src/backend/OpenCL/CHIPBackendOpenCL.hh

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,32 @@ public:
256256
};
257257

258258
class CHIPQueueOpenCL : public chipstar::Queue {
259-
protected:
260-
// Any reason to make these private/protected?
261-
cl::CommandQueue *ClQueue_;
259+
// Profiling queue is known to slow down driver API calls on Intel
260+
// OpenCL implementation but profiling is only needed for acquiring
261+
// device timestamps for fulfilling hipEventElapsedTime() calls. At
262+
// start we use non-profiling queue and switch to profiling one when
263+
// needed (hipEventRecord() is called).
264+
//
265+
// TODO: Switch back to non-profiling queue when possible -
266+
// e.g. when HIP event object count drops to zero.
267+
//
268+
// An alternative would be modifying the queue's properties via
269+
// clSetCommandQueueProperty() but that's optional driver feature.
270+
cl::CommandQueue ClRegularQueue_;
271+
cl::CommandQueue ClProfilingQueue_;
272+
273+
// Enumeration for indicating the currently active queue.
274+
enum QueueMode {
275+
Regular, /// The non-profiling queue. ClRegularQueue_ is active.
276+
Profiling /// ClProfilingQueue_ is active.
277+
};
278+
279+
QueueMode QueueMode_ = Regular;
280+
281+
/// Set to true when this instance is shared with another API.
282+
bool UsedInInterOp = false;
262283

284+
protected:
263285
/**
264286
* @brief Map memory to device.
265287
*
@@ -319,6 +341,9 @@ public:
319341
memPrefetchImpl(const void *Ptr, size_t Count) override;
320342
std::pair<std::vector<cl_event>, chipstar::LockGuardVector>
321343
addDependenciesQueueSync(std::shared_ptr<chipstar::Event> TargetEvent);
344+
345+
private:
346+
void switchModeTo(QueueMode Mode);
322347
};
323348

324349
class CHIPKernelOpenCL : public chipstar::Kernel {

0 commit comments

Comments
 (0)