diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 6de220275d116..809814cbf71ef 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -139,7 +139,8 @@ variables in production code. | -------------------- | ------ | ----------- | | `SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE` | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. | | `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | EnableBuffers, MaxPoolSize [, MemType, MaxPoolableSize, Capacity, SlabMinSize]... | EnableBuffers enables pooling for SYCL buffers, default false. MaxPoolSize is the maximum size of the pool, default 0. MemType is host, device or shared. Other parameters are values specified as positive integers with optional K, M or G suffix. MaxPoolableSize is the maximum allocation size that may be pooled, default 0 for host and shared, 32KB for device. Capacity is the number of allocations in each size range freed by the program but retained in the pool for reallocation, default 0. Size ranges follow this pattern: 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. SlabMinSize is the minimum allocation size, 64KB for host and device, 2MB for shared. | -| `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. | +| `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of compute commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. | +| `SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE` | Integer | Sets a preferred number of copy commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. | | `SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST` | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. | | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Any(\*) | This environment variable enables users to control use of copy engines for copy operations. If the value is an integer, it will allow the use of copy engines, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The value of this environment variable can also be a pair of the form "lower_index:upper_index" where the indices point to copy engines in a list of all available copy engines. The default is 1. | | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` (experimental) | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin for device to device copy operations. The default is 0. This option is experimental and will be removed once heuristics are added to make a decision about use of copy engine for device to device copy operations. | diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c71c1e9323d97..6f8c27c2b2890 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -825,7 +825,7 @@ bool _pi_queue::isInOrderQueue() const { pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, bool MakeAvailable) { bool UseCopyEngine = CommandList->second.isCopy(); - auto &ZeCommandListCache = (UseCopyEngine) + auto &ZeCommandListCache = UseCopyEngine ? this->Context->ZeCopyCommandListCache : this->Context->ZeComputeCommandListCache; @@ -903,11 +903,15 @@ typedef struct CommandListBatchConfig { bool dynamic() const { return Size == 0; } } zeCommandListBatchConfig; -static const zeCommandListBatchConfig ZeCommandListBatch = [] { +// Helper function to initialize static variables that holds batch config info +// for compute and copy command batching. +static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { zeCommandListBatchConfig Config{}; // default initialize // Default value of 0. This specifies to use dynamic batch size adjustment. - const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); + const auto BatchSizeStr = + (IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE") + : std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); if (BatchSizeStr) { pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); // Level Zero may only support a limted number of commands per command @@ -939,7 +943,11 @@ static const zeCommandListBatchConfig ZeCommandListBatch = [] { try { Val = std::stoi(BatchConfig.substr(Pos)); } catch (...) { - zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n"); + if (IsCopy) + zePrint( + "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n"); + else + zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n"); break; } switch (Ord) { @@ -961,42 +969,77 @@ static const zeCommandListBatchConfig ZeCommandListBatch = [] { default: die("Unexpected batch config"); } - zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n", - (int)Ord, (int)Val); + if (IsCopy) + zePrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param " + "#%d: %d\n", + (int)Ord, (int)Val); + else + zePrint( + "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n", + (int)Ord, (int)Val); }; } else { // Negative batch sizes are silently ignored. - zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n"); + if (IsCopy) + zePrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n"); + else + zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n"); } } return Config; +} + +// Static variable that holds batch config info for compute command batching. +static const zeCommandListBatchConfig ZeCommandListBatchComputeConfig = [] { + using IsCopy = bool; + return ZeCommandListBatchConfig(IsCopy{false}); +}(); + +// Static variable that holds batch config info for copy command batching. +static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] { + using IsCopy = bool; + return ZeCommandListBatchConfig(IsCopy{true}); }(); +_pi_queue::_pi_queue(ze_command_queue_handle_t Queue, + std::vector &CopyQueues, + pi_context Context, pi_device Device, + bool OwnZeCommandQueue, + pi_queue_properties PiQueueProperties) + : ZeComputeCommandQueue{Queue}, ZeCopyCommandQueues{CopyQueues}, + Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue}, + PiQueueProperties(PiQueueProperties) { + ComputeCommandBatch.OpenCommandList = CommandListMap.end(); + CopyCommandBatch.OpenCommandList = CommandListMap.end(); + ComputeCommandBatch.QueueBatchSize = + ZeCommandListBatchComputeConfig.startSize(); + CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize(); +} + // Retrieve an available command list to be used in a PI call // Caller must hold a lock on the Queue passed in. -pi_result _pi_context::getAvailableCommandList( - pi_queue Queue, pi_command_list_ptr_t &CommandList, bool PreferCopyEngine, - bool AllowBatching) { +pi_result +_pi_context::getAvailableCommandList(pi_queue Queue, + pi_command_list_ptr_t &CommandList, + bool UseCopyEngine, bool AllowBatching) { + auto &CommandBatch = + UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // Handle batching of commands // First see if there is an command-list open for batching commands // for this queue. - if (Queue->hasOpenCommandList()) { - // TODO: Batching of copy commands will be supported. - if (AllowBatching && !PreferCopyEngine) { - CommandList = Queue->OpenCommandList; + if (Queue->hasOpenCommandList(UseCopyEngine)) { + if (AllowBatching) { + CommandList = CommandBatch.OpenCommandList; return PI_SUCCESS; } - // If this command isn't allowed to be batched, then we need to // go ahead and execute what is already in the batched list, // and then go on to process this. On exit from executeOpenCommandList // OpenCommandList will be invalidated. - if (auto Res = Queue->executeOpenCommandList()) + if (auto Res = Queue->executeOpenCommandList(UseCopyEngine)) return Res; } - bool UseCopyEngine = - (!(Queue->isInOrderQueue()) || UseCopyEngineForInOrderQueue) && - PreferCopyEngine && Queue->Device->hasCopyEngine(); // Create/Reuse the command list, because in Level Zero commands are added to // the command lists, and later are then added to the command queue. @@ -1005,7 +1048,7 @@ pi_result _pi_context::getAvailableCommandList( _pi_result pi_result = PI_OUT_OF_RESOURCES; ZeStruct ZeFenceDesc; - auto &ZeCommandListCache = (UseCopyEngine) + auto &ZeCommandListCache = UseCopyEngine ? Queue->Context->ZeCopyCommandListCache : Queue->Context->ZeComputeCommandListCache; @@ -1035,7 +1078,7 @@ pi_result _pi_context::getAvailableCommandList( if (UseCopyEngine) ZeCopyCommandQueue = Queue->getZeCopyCommandQueue(&CopyQueueIndex); auto &ZeCommandQueue = - (UseCopyEngine) ? ZeCopyCommandQueue : Queue->ZeComputeCommandQueue; + UseCopyEngine ? ZeCopyCommandQueue : Queue->ZeComputeCommandQueue; ze_fence_handle_t ZeFence; ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); @@ -1091,8 +1134,8 @@ pi_result _pi_context::getAvailableCommandList( } ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = - (UseCopyEngine) ? ZeCopyCommandQueueGroupIndex - : Queue->Device->ZeComputeQueueGroupIndex; + UseCopyEngine ? ZeCopyCommandQueueGroupIndex + : Queue->Device->ZeComputeQueueGroupIndex; ZE_CALL(zeCommandListCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice, @@ -1101,7 +1144,7 @@ pi_result _pi_context::getAvailableCommandList( Queue->Device->Platform->ZeGlobalCommandListCount++; auto &ZeCommandQueue = - (UseCopyEngine) ? ZeCopyCommandQueue : Queue->ZeComputeCommandQueue; + UseCopyEngine ? ZeCopyCommandQueue : Queue->ZeComputeCommandQueue; ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert( std::pair( @@ -1112,53 +1155,68 @@ pi_result _pi_context::getAvailableCommandList( return pi_result; } -void _pi_queue::adjustBatchSizeForFullBatch() { +void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + auto &ZeCommandListBatchConfig = + IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; + pi_uint32 QueueBatchSize = CommandBatch.QueueBatchSize; // QueueBatchSize of 0 means never allow batching. - if (QueueBatchSize == 0 || !ZeCommandListBatch.dynamic()) + if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) return; - - NumTimesClosedFull += 1; + CommandBatch.NumTimesClosedFull += 1; // If the number of times the list has been closed early is low, and // the number of times it has been closed full is high, then raise // the batching size slowly. Don't raise it if it is already pretty // high. - if (NumTimesClosedEarly <= ZeCommandListBatch.NumTimesClosedEarlyThreshold && - NumTimesClosedFull > ZeCommandListBatch.NumTimesClosedFullThreshold) { - if (QueueBatchSize < ZeCommandListBatch.DynamicSizeMax) { - QueueBatchSize += ZeCommandListBatch.DynamicSizeStep; + if (CommandBatch.NumTimesClosedEarly <= + ZeCommandListBatchConfig.NumTimesClosedEarlyThreshold && + CommandBatch.NumTimesClosedFull > + ZeCommandListBatchConfig.NumTimesClosedFullThreshold) { + if (QueueBatchSize < ZeCommandListBatchConfig.DynamicSizeMax) { + QueueBatchSize += ZeCommandListBatchConfig.DynamicSizeStep; zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize); } - NumTimesClosedEarly = 0; - NumTimesClosedFull = 0; + CommandBatch.NumTimesClosedEarly = 0; + CommandBatch.NumTimesClosedFull = 0; } + CommandBatch.QueueBatchSize = QueueBatchSize; } -void _pi_queue::adjustBatchSizeForPartialBatch() { +void _pi_queue::adjustBatchSizeForPartialBatch(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + auto &ZeCommandListBatchConfig = + IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; + pi_uint32 QueueBatchSize = CommandBatch.QueueBatchSize; // QueueBatchSize of 0 means never allow batching. - if (QueueBatchSize == 0 || !ZeCommandListBatch.dynamic()) + if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) return; - - NumTimesClosedEarly += 1; + CommandBatch.NumTimesClosedEarly += 1; // If we are closing early more than about 3x the number of times // it is closing full, lower the batch size to the value of the // current open command list. This is trying to quickly get to a // batch size that will be able to be closed full at least once // in a while. - if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) { - QueueBatchSize = OpenCommandList->second.size() - 1; + if (CommandBatch.NumTimesClosedEarly > + (CommandBatch.NumTimesClosedFull + 1) * 3) { + QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1; if (QueueBatchSize < 1) QueueBatchSize = 1; zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize); - NumTimesClosedEarly = 0; - NumTimesClosedFull = 0; + CommandBatch.NumTimesClosedEarly = 0; + CommandBatch.NumTimesClosedFull = 0; } } pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, bool IsBlocking, bool OKToBatchCommand) { + int Index = CommandList->second.CopyQueueIndex; + bool UseCopyEngine = (Index != -1); + if (UseCopyEngine) + zePrint("Command list to be executed on copy engine %d\n", Index); + // If the current LastCommandEvent is the nullptr, then it means // either that no command has ever been issued to the queue // or it means that the LastCommandEvent has been signalled and @@ -1178,25 +1236,27 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, // as indicated by !ZeCommandListBatch.dynamic(), then just ignore // CurrentlyEmpty as we want to strictly follow the batching the user // specified. - if (OKToBatchCommand && this->isBatchingAllowed() && - (!ZeCommandListBatch.dynamic() || !CurrentlyEmpty)) { - - if (hasOpenCommandList() && OpenCommandList != CommandList) + auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch; + auto &ZeCommandListBatchConfig = UseCopyEngine + ? ZeCommandListBatchCopyConfig + : ZeCommandListBatchComputeConfig; + if (OKToBatchCommand && this->isBatchingAllowed(UseCopyEngine) && + (!ZeCommandListBatchConfig.dynamic() || !CurrentlyEmpty)) { + + if (hasOpenCommandList(UseCopyEngine) && + CommandBatch.OpenCommandList != CommandList) die("executeCommandList: OpenCommandList should be equal to" "null or CommandList"); - if (CommandList->second.size() < QueueBatchSize) { - OpenCommandList = CommandList; + if (CommandList->second.size() < CommandBatch.QueueBatchSize) { + CommandBatch.OpenCommandList = CommandList; return PI_SUCCESS; } - adjustBatchSizeForFullBatch(); - OpenCommandList = CommandListMap.end(); + adjustBatchSizeForFullBatch(UseCopyEngine); + CommandBatch.OpenCommandList = CommandListMap.end(); } - int Index = CommandList->second.CopyQueueIndex; - bool UseCopyEngine = (Index != -1); - if (UseCopyEngine) - zePrint("Command list to be executed on copy engine\n"); + // If available, get the copy command queue assosciated with // ZeCommandList ze_command_queue_handle_t ZeCopyCommandQueue = nullptr; @@ -1205,7 +1265,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, return Res; } auto &ZeCommandQueue = - (UseCopyEngine) ? ZeCopyCommandQueue : ZeComputeCommandQueue; + UseCopyEngine ? ZeCopyCommandQueue : ZeComputeCommandQueue; // Scope of the lock must be till the end of the function, otherwise new mem // allocs can be created between the moment when we made a snapshot and the // moment when command list is closed and executed. But mutex is locked only @@ -1250,6 +1310,7 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, ZE_CALL(zeCommandListClose, (CommandList->first)); // Offload command list to the GPU for asynchronous execution auto ZeCommandList = CommandList->first; + zePrint("Calling zeCommandQueueExecuteCommandLists with Index = %d\n", Index); ZE_CALL(zeCommandQueueExecuteCommandLists, (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence)); @@ -1261,8 +1322,10 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, return PI_SUCCESS; } -bool _pi_queue::isBatchingAllowed() { - return (this->QueueBatchSize > 0 && ((ZeSerialize & ZeSerializeBlock) == 0)); +bool _pi_queue::isBatchingAllowed(bool IsCopy) const { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + return (CommandBatch.QueueBatchSize > 0 && + ((ZeSerialize & ZeSerializeBlock) == 0)); } pi_result _pi_queue::getOrCreateCopyCommandQueue( @@ -1353,7 +1416,6 @@ _pi_queue::getZeCopyCommandQueue(int *CopyQueueIndex, else *CopyQueueIndex = LastUsedCopyCommandQueueIndex + 1; LastUsedCopyCommandQueueIndex = *CopyQueueIndex; - zePrint("Note: CopyQueueIndex = %d\n", *CopyQueueIndex); if (CopyQueueGroupIndex) // First queue in the vector of copy queues is the main copy queue, // if available. Otherwise it's a link copy queue. @@ -1367,13 +1429,14 @@ _pi_queue::getZeCopyCommandQueue(int *CopyQueueIndex, return ZeCopyCommandQueue; } -pi_result _pi_queue::executeOpenCommandList() { +pi_result _pi_queue::executeOpenCommandList(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; // If there are any commands still in the open command list for this // queue, then close and execute that command list now. - if (hasOpenCommandList()) { - adjustBatchSizeForPartialBatch(); - auto Res = executeCommandList(OpenCommandList, false, false); - OpenCommandList = CommandListMap.end(); + if (hasOpenCommandList(IsCopy)) { + adjustBatchSizeForPartialBatch(IsCopy); + auto Res = executeCommandList(CommandBatch.OpenCommandList, false, false); + CommandBatch.OpenCommandList = CommandListMap.end(); return Res; } @@ -1430,7 +1493,7 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( // Lock automatically releases when this goes out of scope. std::lock_guard lock(Queue->PiQueueMutex); - if (auto Res = Queue->executeOpenCommandList()) + if (auto Res = Queue->executeAllOpenCommandLists()) return Res; } @@ -2871,9 +2934,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, PI_ASSERT(Queue, PI_INVALID_QUEUE); try { - *Queue = - new _pi_queue(ZeComputeCommandQueue, ZeCopyCommandQueues, Context, - Device, ZeCommandListBatch.startSize(), true, Properties); + *Queue = new _pi_queue(ZeComputeCommandQueue, ZeCopyCommandQueues, Context, + Device, true, Properties); } catch (const std::bad_alloc &) { return PI_OUT_OF_HOST_MEMORY; } catch (...) { @@ -2940,7 +3002,7 @@ pi_result piQueueRelease(pi_queue Queue) { // // It is possible to get to here and still have an open command list // if no wait or finish ever occurred for this queue. - if (auto Res = Queue->executeOpenCommandList()) + if (auto Res = Queue->executeAllOpenCommandLists()) return Res; // Make sure all commands get executed. @@ -2999,8 +3061,14 @@ static pi_result QueueRelease(pi_queue Queue, pi_queue LockedQueue) { } Queue->ZeCopyCommandQueues.clear(); - zePrint("piQueueRelease NumTimesClosedFull %d, NumTimesClosedEarly %d\n", - Queue->NumTimesClosedFull, Queue->NumTimesClosedEarly); + zePrint("piQueueRelease(compute) NumTimesClosedFull %d, " + "NumTimesClosedEarly %d\n", + Queue->ComputeCommandBatch.NumTimesClosedFull, + Queue->ComputeCommandBatch.NumTimesClosedEarly); + zePrint("piQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly " + "%d\n", + Queue->CopyCommandBatch.NumTimesClosedFull, + Queue->CopyCommandBatch.NumTimesClosedEarly); } } if (RefCountZero) @@ -3017,7 +3085,7 @@ pi_result piQueueFinish(pi_queue Queue) { std::lock_guard lock(Queue->PiQueueMutex); // execute any command list that may still be open. - if (auto Res = Queue->executeOpenCommandList()) + if (auto Res = Queue->executeAllOpenCommandLists()) return Res; ZE_CALL(zeHostSynchronize, (Queue->ZeComputeCommandQueue)); @@ -3059,8 +3127,8 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, // TODO: see what we can do to correctly initialize PI queue for // compute vs. copy Level-Zero queue. std::vector ZeroCopyQueues; - *Queue = new _pi_queue(ZeQueue, ZeroCopyQueues, Context, Device, - ZeCommandListBatch.startSize(), OwnNativeHandle); + *Queue = + new _pi_queue(ZeQueue, ZeroCopyQueues, Context, Device, OwnNativeHandle); return PI_SUCCESS; } @@ -4618,7 +4686,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, // Get a new command list to be used on this call pi_command_list_ptr_t CommandList{}; if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, false /* PreferCopyEngine */, + Queue, CommandList, false /* UseCopyEngine */, true /* AllowBatching */)) return Res; @@ -4756,8 +4824,8 @@ _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent) { bool OkToBatch = true; pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - false, OkToBatch)) + if (auto Res = Queue->Context->getAvailableCommandList( + Queue, CommandList, false /* UseCopyEngine */, OkToBatch)) return Res; ZE_CALL(zeCommandListAppendWaitOnEvents, @@ -4838,9 +4906,17 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, // Only do the execute of the open command list if the event that // is being queried and event that is to be signalled by something // currently in that open command list. - if (Event->Queue->hasOpenCommandList() && - Event->Queue->OpenCommandList->first == Event->ZeCommandList) { - if (auto Res = Event->Queue->executeOpenCommandList()) + using IsCopy = bool; + if (Event->Queue->hasOpenCommandList(IsCopy{false}) && + Event->Queue->ComputeCommandBatch.OpenCommandList->first == + Event->ZeCommandList) { + if (auto Res = Event->Queue->executeOpenCommandList(IsCopy{false})) + return Res; + } + if (Event->Queue->hasOpenCommandList(IsCopy{true}) && + Event->Queue->CopyCommandBatch.OpenCommandList->first == + Event->ZeCommandList) { + if (auto Res = Event->Queue->executeOpenCommandList(IsCopy{true})) return Res; } } @@ -5060,7 +5136,6 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { if (NumEvents && !EventList) { return PI_INVALID_EVENT; } - // Make sure to add all host-visible "proxy" event signals if needed. // This ensures that all signalling commands are submitted below and // thus proxy events can be waited without a deadlock. @@ -5071,7 +5146,6 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) return Res; } - // Submit dependent open command lists for execution, if any for (uint32_t I = 0; I < NumEvents; I++) { auto Queue = EventList[I]->Queue; @@ -5080,12 +5154,11 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { std::lock_guard lock(Queue->PiQueueMutex); if (Queue->RefCount > 0) { - if (auto Res = Queue->executeOpenCommandList()) + if (auto Res = Queue->executeAllOpenCommandLists()) return Res; } } } - for (uint32_t I = 0; I < NumEvents; I++) { ze_event_handle_t ZeEvent = EventList[I]->getHostVisibleEvent(); if (!ZeEvent) @@ -5098,7 +5171,6 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { // sooner in case run-time is not calling piEventRelease soon enough. EventList[I]->cleanup(); } - return PI_SUCCESS; } @@ -5501,6 +5573,11 @@ pi_result piEnqueueMemBufferReadRect( } // extern "C" +bool _pi_queue::useCopyEngine(bool PreferCopyEngine) const { + return (!isInOrderQueue() || UseCopyEngineForInOrderQueue) && + PreferCopyEngine && Device->hasCopyEngine(); +} + // Shared by all memory read/write/copy PI interfaces. // PI interfaces must not have queue's mutex locked on entry. static pi_result enqueueMemCopyHelper(pi_command_type CommandType, @@ -5512,7 +5589,6 @@ static pi_result enqueueMemCopyHelper(pi_command_type CommandType, pi_event *Event, bool PreferCopyEngine) { PI_ASSERT(Queue, PI_INVALID_QUEUE); PI_ASSERT(Event, PI_INVALID_EVENT); - // Lock automatically releases when this goes out of scope. std::lock_guard lock(Queue->PiQueueMutex); @@ -5523,8 +5599,13 @@ static pi_result enqueueMemCopyHelper(pi_command_type CommandType, // Get a new command list to be used on this call pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - PreferCopyEngine)) + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + if (auto Res = Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, OkToBatch)) return Res; ze_event_handle_t ZeEvent = nullptr; @@ -5551,7 +5632,8 @@ static pi_result enqueueMemCopyHelper(pi_command_type CommandType, pi_cast(ZeEvent)); printZeEventList(WaitList); - if (auto Res = Queue->executeCommandList(CommandList, BlockingWrite)) + if (auto Res = + Queue->executeCommandList(CommandList, BlockingWrite, OkToBatch)) return Res; return PI_SUCCESS; @@ -5580,8 +5662,11 @@ static pi_result enqueueMemCopyRectHelper( // Get a new command list to be used on this call pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - PreferCopyEngine)) + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + if (auto Res = Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, OkToBatch)) return Res; ze_event_handle_t ZeEvent = nullptr; @@ -5647,7 +5732,7 @@ static pi_result enqueueMemCopyRectHelper( zePrint("calling zeCommandListAppendBarrier() with Event %#lx\n", pi_cast(ZeEvent)); - if (auto Res = Queue->executeCommandList(CommandList, Blocking)) + if (auto Res = Queue->executeCommandList(CommandList, Blocking, OkToBatch)) return Res; return PI_SUCCESS; @@ -5781,8 +5866,11 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, PI_INVALID_VALUE); pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - PreferCopyEngine)) + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + if (auto Res = Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, OkToBatch)) return Res; ze_event_handle_t ZeEvent = nullptr; @@ -5812,7 +5900,7 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, // Execute command list asynchronously, as the event will be used // to track down its completion. - if (auto Res = Queue->executeCommandList(CommandList)) + if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch)) return Res; return PI_SUCCESS; @@ -6149,8 +6237,11 @@ static pi_result enqueueMemImageCommandHelper( // Get a new command list to be used on this call pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - PreferCopyEngine)) + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + if (auto Res = Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, OkToBatch)) return Res; ze_event_handle_t ZeEvent = nullptr; @@ -6257,7 +6348,7 @@ static pi_result enqueueMemImageCommandHelper( return PI_INVALID_OPERATION; } - if (auto Res = Queue->executeCommandList(CommandList, IsBlocking)) + if (auto Res = Queue->executeCommandList(CommandList, IsBlocking, OkToBatch)) return Res; return PI_SUCCESS; @@ -7078,10 +7169,10 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, // Get a new command list to be used on this call pi_command_list_ptr_t CommandList{}; - // TODO: Change PreferCopyEngine argument to 'true' once L0 backend + // TODO: Change UseCopyEngine argument to 'true' once L0 backend // support is added if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, false /* PreferCopyEngine */)) + Queue, CommandList, false /* UseCopyEngine */)) return Res; // TODO: do we need to create a unique command type for this? @@ -7136,11 +7227,11 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, // Get a new command list to be used on this call pi_command_list_ptr_t CommandList{}; - // PreferCopyEngine is set to 'false' here. + // UseCopyEngine is set to 'false' here. // TODO: Additional analysis is required to check if this operation will // run faster on copy engines. if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, false /* PreferCopyEngine */)) + Queue, CommandList, false /* UseCopyEngine */)) return Res; // TODO: do we need to create a unique command type for this? diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 86d3c5f13f6fc..689c47ee14a10 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -531,12 +531,14 @@ struct _pi_context : _pi_object { // caller must pass a command queue to create a new fence for the new command // list if a command list/fence pair is not available. All Command Lists & // associated fences are destroyed at Device Release. + // If UseCopyEngine is true, the command will eventually be executed in a + // copy engine. Otherwise, the command will be executed in a compute engine. // If AllowBatching is true, then the command list returned may already have // command in it, if AllowBatching is false, any open command lists that // already exist in Queue will be closed and executed. pi_result getAvailableCommandList(pi_queue Queue, pi_command_list_ptr_t &CommandList, - bool PreferCopyCommandList = false, + bool UseCopyEngine = false, bool AllowBatching = false); // Get index of the free slot in the available pool. If there is no available @@ -603,14 +605,8 @@ struct _pi_context : _pi_object { struct _pi_queue : _pi_object { _pi_queue(ze_command_queue_handle_t Queue, std::vector &CopyQueues, - pi_context Context, pi_device Device, pi_uint32 BatchSize, - bool OwnZeCommandQueue, pi_queue_properties PiQueueProperties = 0) - : ZeComputeCommandQueue{Queue}, - ZeCopyCommandQueues{CopyQueues}, Context{Context}, Device{Device}, - QueueBatchSize{BatchSize}, OwnZeCommandQueue{OwnZeCommandQueue}, - PiQueueProperties(PiQueueProperties) { - OpenCommandList = CommandListMap.end(); - } + pi_context Context, pi_device Device, bool OwnZeCommandQueue, + pi_queue_properties PiQueueProperties = 0); // Level Zero compute command queue handle. ze_command_queue_handle_t ZeComputeCommandQueue; @@ -621,6 +617,11 @@ struct _pi_queue : _pi_object { // link copy engines, if available. std::vector ZeCopyCommandQueues; + // This function considers multiple factors including copy engine + // availability and user preference and returns a boolean that is used to + // specify if copy engine will eventually be used for a particular command. + bool useCopyEngine(bool PreferCopyEngine = true) const; + // This function will check if a Ze copy command queue is available in // ZeCopyCommandQueues at index 'Index'. // If available, it will return the queue. Otherwise, it will create a new @@ -672,33 +673,43 @@ struct _pi_queue : _pi_object { // for execution. std::vector KernelsToBeSubmitted; - // Approximate number of commands that are allowed to be batched for - // this queue. - // Added this member to the queue rather than using a global variable - // so that future implementation could use heuristics to change this on - // a queue specific basis. And by putting it in the queue itself, this - // is thread safe because of the locking of the queue that occurs. - pi_uint32 QueueBatchSize = {0}; - // Indicates if we own the ZeCommandQueue or it came from interop that // asked to not transfer the ownership to SYCL RT. bool OwnZeCommandQueue; - // These two members are used to keep track of how often the - // batching closes and executes a command list before reaching the - // QueueBatchSize limit, versus how often we reach the limit. - // This info might be used to vary the QueueBatchSize value. - pi_uint32 NumTimesClosedEarly = {0}; - pi_uint32 NumTimesClosedFull = {0}; - // Map of all command lists used in this queue. pi_command_list_map_t CommandListMap; - // Open command list field for batching commands into this queue. - pi_command_list_ptr_t OpenCommandList{}; - bool hasOpenCommandList() const { - return OpenCommandList != CommandListMap.end(); - } + // Helper data structure to hold all variables related to batching + typedef struct CommandBatch { + // These two members are used to keep track of how often the + // batching closes and executes a command list before reaching the + // QueueComputeBatchSize limit, versus how often we reach the limit. + // This info might be used to vary the QueueComputeBatchSize value. + pi_uint32 NumTimesClosedEarly = {0}; + pi_uint32 NumTimesClosedFull = {0}; + + // Open command list fields for batching commands into this queue. + pi_command_list_ptr_t OpenCommandList{}; + + // Approximate number of commands that are allowed to be batched for + // this queue. + // Added this member to the queue rather than using a global variable + // so that future implementation could use heuristics to change this on + // a queue specific basis. And by putting it in the queue itself, this + // is thread safe because of the locking of the queue that occurs. + pi_uint32 QueueBatchSize = {0}; + } command_batch; + + // ComputeCommandBatch holds data related to batching of non-copy commands. + // CopyCommandBatch holds data related to batching of copy commands. + command_batch ComputeCommandBatch, CopyCommandBatch; + + // Returns true if any commands for this queue are allowed to + // be batched together. + // For copy commands, IsCopy is set to 'true'. + // For non-copy commands, IsCopy is set to 'false'. + bool isBatchingAllowed(bool IsCopy) const; // Keeps the properties of this queue. pi_queue_properties PiQueueProperties; @@ -706,17 +717,17 @@ struct _pi_queue : _pi_object { // Returns true if the queue is a in-order queue. bool isInOrderQueue() const; - // Returns true if any commands for this queue are allowed to - // be batched together. - bool isBatchingAllowed(); - // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. - void adjustBatchSizeForFullBatch(); + // For copy commands, IsCopy is set to 'true'. + // For non-copy commands, IsCopy is set to 'false'. + void adjustBatchSizeForFullBatch(bool IsCopy); // adjust the queue's batch size, knowing that the current command list // is being closed with only a partial batch of commands. - void adjustBatchSizeForPartialBatch(); + // For copy commands, IsCopy is set to 'true'. + // For non-copy commands, IsCopy is set to 'false'. + void adjustBatchSizeForPartialBatch(bool IsCopy); // Resets the Command List and Associated fence in the ZeCommandListFenceMap. // If the reset command list should be made available, then MakeAvailable @@ -725,6 +736,14 @@ struct _pi_queue : _pi_object { pi_result resetCommandList(pi_command_list_ptr_t CommandList, bool MakeAvailable); + // Returns true if an OpenCommandList has commands that need to be submitted. + // If IsCopy is 'true', then the OpenCommandList containing copy commands is + // checked. Otherwise, the OpenCommandList containing compute commands is + // checked. + bool hasOpenCommandList(bool IsCopy) const { + auto CommandBatch = (IsCopy) ? CopyCommandBatch : ComputeCommandBatch; + return CommandBatch.OpenCommandList != CommandListMap.end(); + } // Attach a command list to this queue, close, and execute it. // Note that this command list cannot be appended to after this. // The "IsBlocking" tells if the wait for completion is required. @@ -738,9 +757,23 @@ struct _pi_queue : _pi_object { bool OKToBatchCommand = false); // If there is an open command list associated with this queue, - // close it, execute it, and reset ZeOpenCommandList, ZeCommandListFence, - // and ZeOpenCommandListSize. - pi_result executeOpenCommandList(); + // close it, execute it, and reset the corresponding OpenCommandList. + // If IsCopy is 'true', then the OpenCommandList containing copy commands is + // executed. Otherwise OpenCommandList containing compute commands is + // executed. + pi_result executeOpenCommandList(bool IsCopy); + + // Wrapper function to execute both OpenCommandLists (Copy and Compute). + // This wrapper is helpful when all 'open' commands need to be executed. + // Call-sites instances: piQuueueFinish, piQueueRelease, etc. + pi_result executeAllOpenCommandLists() { + using IsCopy = bool; + if (auto Res = executeOpenCommandList(IsCopy{false})) + return Res; + if (auto Res = executeOpenCommandList(IsCopy{true})) + return Res; + return PI_SUCCESS; + } // Besides each PI object keeping a total reference count in // _pi_object::RefCount we keep special track of the queue *external*