diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 4ea9394f2034f..aaf492649be17 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -235,7 +235,7 @@ class DispatchHostTask {
     // Thus we employ read-lock of graph.
     {
       Scheduler &Sched = Scheduler::getInstance();
-      std::shared_lock<std::shared_timed_mutex> Lock(Sched.MGraphLock);
+      Scheduler::ReadLockT Lock(Sched.MGraphLock);

       std::vector<DepDesc> Deps = MThisCmd->MDeps;

@@ -481,7 +481,7 @@ void Command::makeTraceEventEpilog() {
 #endif
 }

-void Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
+Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
   const QueueImplPtr &WorkerQueue = getWorkerQueue();
   const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();

@@ -493,21 +493,25 @@ void Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
     // call to waitInternal() is in waitForPreparedHostEvents() as it's called
     // from enqueue process functions
     MPreparedHostDepsEvents.push_back(DepEvent);
-    return;
+    return nullptr;
   }

+  Command *ConnectionCmd = nullptr;
+
   // Do not add redundant event dependencies for in-order queues.
   if (Dep.MDepCommand && Dep.MDepCommand->getWorkerQueue() == WorkerQueue &&
       WorkerQueue->has_property<property::queue::in_order>())
-    return;
+    return nullptr;

   ContextImplPtr DepEventContext = DepEvent->getContextImpl();
   // If contexts don't match we'll connect them using host task
   if (DepEventContext != WorkerContext && !WorkerContext->is_host()) {
     Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
-    GB.connectDepEvent(this, DepEvent, Dep);
+    ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep);
   } else
     MPreparedDepsEvents.push_back(std::move(DepEvent));
+
+  return ConnectionCmd;
 }

 const ContextImplPtr &Command::getWorkerContext() const {
@@ -516,9 +520,11 @@ const ContextImplPtr &Command::getWorkerContext() const {

 const QueueImplPtr &Command::getWorkerQueue() const { return MQueue; }

-void Command::addDep(DepDesc NewDep) {
+Command *Command::addDep(DepDesc NewDep) {
+  Command *ConnectionCmd = nullptr;
+
   if (NewDep.MDepCommand) {
-    processDepEvent(NewDep.MDepCommand->getEvent(), NewDep);
+    ConnectionCmd = processDepEvent(NewDep.MDepCommand->getEvent(), NewDep);
   }
   MDeps.push_back(NewDep);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
@@ -526,9 +532,11 @@ void Command::addDep(DepDesc NewDep) {
       NewDep.MDepCommand, (void *)NewDep.MDepRequirement->MSYCLMemObj,
       accessModeToString(NewDep.MDepRequirement->MAccessMode), true);
 #endif
+
+  return ConnectionCmd;
 }

-void Command::addDep(EventImplPtr Event) {
+Command *Command::addDep(EventImplPtr Event) {
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   // We need this for just the instrumentation, so guarding it will prevent
   // unused variable warnings when instrumentation is turned off
@@ -538,7 +546,7 @@ void Command::addDep(EventImplPtr Event) {
   emitEdgeEventForEventDependence(Cmd, PiEventAddr);
 #endif

-  processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr});
+  return processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr});
 }

 void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) {
@@ -732,7 +740,10 @@ AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
   // Node event must be created before the dependent edge is added to this
   // node, so this call must be before the addDep() call.
   emitInstrumentationDataProxy();
-  addDep(DepDesc(nullptr, getRequirement(), this));
+  // "Nothing to depend on"
+  Command *ConnectionCmd = addDep(DepDesc(nullptr, getRequirement(), this));
+  assert(ConnectionCmd == nullptr);
+  (void)ConnectionCmd;
 }

 void AllocaCommand::emitInstrumentationData() {
@@ -795,7 +806,8 @@ void AllocaCommand::printDot(std::ostream &Stream) const {
 }

 AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
-                                         AllocaCommandBase *ParentAlloca)
+                                         AllocaCommandBase *ParentAlloca,
+                                         std::vector<Command *> &ToEnqueue)
     : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
                         std::move(Req),
                         /*LinkedAllocaCmd*/ nullptr),
@@ -804,7 +816,10 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
   // is added to this node, so this call must be before
   // the addDep() call.
   emitInstrumentationDataProxy();
-  addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca));
+  Command *ConnectionCmd =
+      addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca));
+  if (ConnectionCmd)
+    ToEnqueue.push_back(ConnectionCmd);
 }

 void AllocaSubBufCommand::emitInstrumentationData() {
@@ -1329,7 +1344,10 @@ void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
   MRequirements.emplace_back(ReqRef);
   const Requirement *const StoredReq = &MRequirements.back();

-  addDep(DepDesc{DepCmd, StoredReq, AllocaCmd});
+  // EmptyCommand is always a host command, so the result of addDep is
+  // expected to be nullptr
+  Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd});
+  assert(Cmd == nullptr && "Connection command should be null for EmptyCommand");
+  (void)Cmd;
 }

 void EmptyCommand::emitInstrumentationData() {
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 677cdc0cfcceb..f6262a907e8d6 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -107,9 +107,11 @@ class Command {

   Command(CommandType Type, QueueImplPtr Queue);

-  void addDep(DepDesc NewDep);
+  /// \return an optional connection cmd to enqueue
+  [[nodiscard]] Command *addDep(DepDesc NewDep);

-  void addDep(EventImplPtr Event);
+  /// \return an optional connection cmd to enqueue
+  [[nodiscard]] Command *addDep(EventImplPtr Event);

   void addUser(Command *NewUser) { MUsers.insert(NewUser); }

@@ -204,13 +206,15 @@ class Command {
   /// Perform glueing of events from different contexts
   /// \param DepEvent event this command should depend on
   /// \param Dep optional DepDesc to perform connection of events properly
+  /// \return returns an optional connection command to enqueue
   ///
   /// Glueing (i.e. connecting) will be performed if and only if DepEvent is
   /// not from host context and its context doesn't match to context of this
   /// command. Context of this command is fetched via getWorkerContext().
   ///
   /// Optionality of Dep is set by Dep.MDepCommand not equal to nullptr.
-  void processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep);
+  [[nodiscard]] Command *processDepEvent(EventImplPtr DepEvent,
+                                         const DepDesc &Dep);

   /// Private interface. Derived classes should implement this method.
   virtual cl_int enqueueImp() = 0;
@@ -387,7 +391,8 @@ class AllocaCommand : public AllocaCommandBase {
 class AllocaSubBufCommand : public AllocaCommandBase {
 public:
   AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
-                      AllocaCommandBase *ParentAlloca);
+                      AllocaCommandBase *ParentAlloca,
+                      std::vector<Command *> &ToEnqueue);

   void *getMemAllocation() const final;
   void printDot(std::ostream &Stream) const final;
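The pattern introduced above recurs through the rest of the patch: addDep() and
processDepEvent() no longer enqueue the auxiliary "connection" command
themselves but hand it back, and every caller either asserts that none was
produced or pushes it into a ToEnqueue list. A minimal self-contained sketch of
the caller-side idiom, with stand-in types rather than the real SYCL classes:

    #include <vector>

    struct Command {};

    // Stand-in for Command::addDep(): returns an auxiliary "connection"
    // command when two contexts have to be glued together, nullptr otherwise.
    Command *addDep(Command * /*Dep*/) { return nullptr; }

    // Caller-side idiom: collect connection commands instead of enqueueing
    // them while the graph lock is still held.
    void collectDeps(const std::vector<Command *> &Deps,
                     std::vector<Command *> &ToEnqueue) {
      for (Command *Dep : Deps)
        if (Command *ConnectionCmd = addDep(Dep))
          ToEnqueue.push_back(ConnectionCmd);
    }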
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 440a7e6973e27..383bcee2f9d7e 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -172,9 +172,9 @@ MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
   return MemObject->MRecord.get();
 }

-MemObjRecord *
-Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
-                                                 const Requirement *Req) {
+MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
+    const QueueImplPtr &Queue, const Requirement *Req,
+    std::vector<Command *> &ToEnqueue) {
   SYCLMemObjI *MemObject = Req->MSYCLMemObj;
   MemObjRecord *Record = getMemObjRecord(MemObject);

@@ -183,12 +183,14 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
     const size_t LeafLimit = 8;
     LeavesCollection::AllocateDependencyF AllocateDependency =
-        [this](Command *Dependant, Command *Dependency, MemObjRecord *Record) {
+        [this](Command *Dependant, Command *Dependency, MemObjRecord *Record,
+               LeavesCollection::EnqueueListT &ToEnqueue) {
           // Add the old leaf as a dependency for the new one by duplicating one
           // of the requirements for the current record
           DepDesc Dep = findDepForRecord(Dependant, Record);
           Dep.MDepCommand = Dependency;
-          Dependant->addDep(Dep);
+          if (Command *ConnectionCmd = Dependant->addDep(Dep))
+            ToEnqueue.push_back(ConnectionCmd);
           Dependency->addUser(Dependant);
           --(Dependency->MLeafCounter);
         };
@@ -212,7 +214,8 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
     MemObject->MRecord.reset(
         new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
-    getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr);
+    getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr,
+                            ToEnqueue);
   } else
     MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(),
                                               LeafLimit, AllocateDependency});
@@ -235,18 +238,19 @@ void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
   }
 }

-void Scheduler::GraphBuilder::addNodeToLeaves(MemObjRecord *Record,
-                                              Command *Cmd,
-                                              access::mode AccessMode) {
+void Scheduler::GraphBuilder::addNodeToLeaves(
+    MemObjRecord *Record, Command *Cmd, access::mode AccessMode,
+    std::vector<Command *> &ToEnqueue) {
   LeavesCollection &Leaves{AccessMode == access::mode::read
                                ? Record->MReadLeaves
                                : Record->MWriteLeaves};
-  if (Leaves.push_back(Cmd))
+  if (Leaves.push_back(Cmd, ToEnqueue))
     ++Cmd->MLeafCounter;
 }

 UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
-    MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue) {
+    MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
+    std::vector<Command *> &ToEnqueue) {
   AllocaCommandBase *AllocaCmd =
       findAllocaForReq(Record, Req, Queue->getContextImplPtr());
   assert(AllocaCmd && "There must be alloca for requirement!");
@@ -259,11 +263,14 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd(
   std::set<Command *> Deps =
       findDepsForReq(Record, Req, Queue->getContextImplPtr());
   for (Command *Dep : Deps) {
-    UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd});
+    Command *ConnCmd =
+        UpdateCommand->addDep(DepDesc{Dep, StoredReq, AllocaCmd});
+    if (ConnCmd)
+      ToEnqueue.push_back(ConnCmd);
     Dep->addUser(UpdateCommand);
   }
   updateLeaves(Deps, Record, Req->MAccessMode);
-  addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode);
+  addNodeToLeaves(Record, UpdateCommand, Req->MAccessMode, ToEnqueue);
   return UpdateCommand;
 }

@@ -296,11 +303,12 @@ static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc,
   return MapCmd;
 }

-Command *Scheduler::GraphBuilder::insertMemoryMove(MemObjRecord *Record,
-                                                   Requirement *Req,
-                                                   const QueueImplPtr &Queue) {
-
-  AllocaCommandBase *AllocaCmdDst = getOrCreateAllocaForReq(Record, Req, Queue);
+Command *Scheduler::GraphBuilder::insertMemoryMove(
+    MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue,
+    std::vector<Command *> &ToEnqueue) {
+  AllocaCommandBase *AllocaCmdDst =
+      getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
   if (!AllocaCmdDst)
     throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);

@@ -369,17 +377,21 @@ Command *Scheduler::GraphBuilder::insertMemoryMove(MemObjRecord *Record,
   }

   for (Command *Dep : Deps) {
-    NewCmd->addDep(DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst});
+    Command *ConnCmd =
+        NewCmd->addDep(DepDesc{Dep, NewCmd->getRequirement(), AllocaCmdDst});
+    if (ConnCmd)
+      ToEnqueue.push_back(ConnCmd);
     Dep->addUser(NewCmd);
   }
   updateLeaves(Deps, Record, access::mode::read_write);
-  addNodeToLeaves(Record, NewCmd, access::mode::read_write);
+  addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue);
   Record->MCurContext = Queue->getContextImplPtr();
   return NewCmd;
 }

 Command *Scheduler::GraphBuilder::remapMemoryObject(
-    MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd) {
+    MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd,
+    std::vector<Command *> &ToEnqueue) {
   assert(HostAllocaCmd->getQueue()->is_host() &&
          "Host alloca command expected");
   assert(HostAllocaCmd->MIsActive && "Active alloca command expected");
@@ -402,23 +414,30 @@ Command *Scheduler::GraphBuilder::remapMemoryObject(
       &HostAllocaCmd->MMemAllocation, LinkedAllocaCmd->getQueue(), MapMode);

   for (Command *Dep : Deps) {
-    UnMapCmd->addDep(DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd});
+    Command *ConnCmd = UnMapCmd->addDep(
+        DepDesc{Dep, UnMapCmd->getRequirement(), LinkedAllocaCmd});
+    if (ConnCmd)
+      ToEnqueue.push_back(ConnCmd);
     Dep->addUser(UnMapCmd);
   }

-  MapCmd->addDep(DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd});
+  Command *ConnCmd = MapCmd->addDep(
+      DepDesc{UnMapCmd, MapCmd->getRequirement(), HostAllocaCmd});
+  if (ConnCmd)
+    ToEnqueue.push_back(ConnCmd);
   UnMapCmd->addUser(MapCmd);

   updateLeaves(Deps, Record, access::mode::read_write);
-  addNodeToLeaves(Record, MapCmd, access::mode::read_write);
+  addNodeToLeaves(Record, MapCmd, access::mode::read_write, ToEnqueue);
   Record->MHostAccess = MapMode;
   return MapCmd;
 }

 // The function adds copy operation of the up to date'st memory to the memory
 // pointed by Req.
-Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) {
-
+Command *
+Scheduler::GraphBuilder::addCopyBack(Requirement *Req,
+                                     std::vector<Command *> &ToEnqueue) {
   QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
   SYCLMemObjI *MemObj = Req->MSYCLMemObj;
   MemObjRecord *Record = getMemObjRecord(MemObj);
@@ -443,12 +462,15 @@ Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) {
   MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();

   for (Command *Dep : Deps) {
-    MemCpyCmd->addDep(DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd});
+    Command *ConnCmd = MemCpyCmd->addDep(
+        DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd});
+    if (ConnCmd)
+      ToEnqueue.push_back(ConnCmd);
     Dep->addUser(MemCpyCmd);
   }

   updateLeaves(Deps, Record, Req->MAccessMode);
-  addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode);
+  addNodeToLeaves(Record, MemCpyCmd, Req->MAccessMode, ToEnqueue);
   if (MPrintOptionsArray[AfterAddCopyBack])
     printGraphAsDot("after_addCopyBack");
   return MemCpyCmd;
@@ -456,30 +478,34 @@ Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) {

 // The function implements SYCL host accessor logic: host accessor
 // should provide access to the buffer in user space.
-Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req) {
+Command *
+Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
+                                         std::vector<Command *> &ToEnqueue) {
   const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue();

-  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req);
+  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
   if (MPrintOptionsArray[BeforeAddHostAcc])
     printGraphAsDot("before_addHostAccessor");
   markModifiedIfWrite(Record, Req);

   AllocaCommandBase *HostAllocaCmd =
-      getOrCreateAllocaForReq(Record, Req, HostQueue);
+      getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue);

   if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(),
               Record->MCurContext)) {
     if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess))
-      remapMemoryObject(Record, Req, HostAllocaCmd);
+      remapMemoryObject(Record, Req, HostAllocaCmd, ToEnqueue);
   } else
-    insertMemoryMove(Record, Req, HostQueue);
+    insertMemoryMove(Record, Req, HostQueue, ToEnqueue);

-  Command *UpdateHostAccCmd = insertUpdateHostReqCmd(Record, Req, HostQueue);
+  Command *UpdateHostAccCmd =
+      insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue);

   // Need empty command to be blocked until host accessor is destructed
-  EmptyCommand *EmptyCmd = addEmptyCmd(
-      UpdateHostAccCmd, {Req}, HostQueue, Command::BlockReason::HostAccessor);
+  EmptyCommand *EmptyCmd =
+      addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue,
+                  Command::BlockReason::HostAccessor, ToEnqueue);

   Req->MBlockedCmd = EmptyCmd;

@@ -490,13 +516,14 @@ Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req) {
 }

 Command *Scheduler::GraphBuilder::addCGUpdateHost(
-    std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr HostQueue) {
+    std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr HostQueue,
+    std::vector<Command *> &ToEnqueue) {
   auto UpdateHost = static_cast<CGUpdateHost *>(CommandGroup.get());
   Requirement *Req = UpdateHost->getReqToUpdate();

-  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req);
-  return insertMemoryMove(Record, Req, HostQueue);
+  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
+  return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
 }

 /// Start the search for the record from list of "leaf" commands and check if
@@ -616,7 +643,8 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) {
 // Note, creation of new allocation command can lead to the current context
 // (Record->MCurContext) change.
 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
-    MemObjRecord *Record, const Requirement *Req, QueueImplPtr Queue) {
+    MemObjRecord *Record, const Requirement *Req, QueueImplPtr Queue,
+    std::vector<Command *> &ToEnqueue) {

   AllocaCommandBase *AllocaCmd =
       findAllocaForReq(Record, Req, Queue->getContextImplPtr());
@@ -632,8 +660,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
                                       /*Working with bytes*/ sizeof(char));

       auto *ParentAlloca =
-          getOrCreateAllocaForReq(Record, &ParentRequirement, Queue);
-      AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca);
+          getOrCreateAllocaForReq(Record, &ParentRequirement, Queue, ToEnqueue);
+      AllocaCmd = new AllocaSubBufCommand(Queue, *Req, ParentAlloca, ToEnqueue);
     } else {

       const Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MMemoryRange,
@@ -672,7 +700,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
             DefaultHostQueue, FullReq, true /* InitFromUserData */,
             nullptr /* LinkedAllocaCmd */);
         Record->MAllocaCommands.push_back(HostAllocaCmd);
-        Record->MWriteLeaves.push_back(HostAllocaCmd);
+        Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue);
         ++(HostAllocaCmd->MLeafCounter);
         Record->MCurContext = DefaultHostQueue->getContextImplPtr();
       }
@@ -719,15 +747,19 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(

       // Update linked command
       if (LinkedAllocaCmd) {
-        AllocaCmd->addDep(DepDesc{LinkedAllocaCmd, AllocaCmd->getRequirement(),
-                                  LinkedAllocaCmd});
+        Command *ConnCmd = AllocaCmd->addDep(DepDesc{
+            LinkedAllocaCmd, AllocaCmd->getRequirement(), LinkedAllocaCmd});
+        if (ConnCmd)
+          ToEnqueue.push_back(ConnCmd);
         LinkedAllocaCmd->addUser(AllocaCmd);
         LinkedAllocaCmd->MLinkedAllocaCmd = AllocaCmd;

         // To ensure that the leader allocation is removed first
-        AllocaCmd->getReleaseCmd()->addDep(
+        ConnCmd = AllocaCmd->getReleaseCmd()->addDep(
             DepDesc(LinkedAllocaCmd->getReleaseCmd(),
                     AllocaCmd->getRequirement(), LinkedAllocaCmd));
+        if (ConnCmd)
+          ToEnqueue.push_back(ConnCmd);

         // Device allocation takes ownership of the host ptr during
         // construction, host allocation doesn't. So, device allocation should
@@ -742,17 +774,20 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
         std::set<Command *> Deps =
             findDepsForReq(Record, Req, Queue->getContextImplPtr());
         for (Command *Dep : Deps) {
-          AllocaCmd->addDep(DepDesc{Dep, Req, LinkedAllocaCmd});
+          Command *ConnCmd =
+              AllocaCmd->addDep(DepDesc{Dep, Req, LinkedAllocaCmd});
+          if (ConnCmd)
+            ToEnqueue.push_back(ConnCmd);
           Dep->addUser(AllocaCmd);
         }
         updateLeaves(Deps, Record, Req->MAccessMode);
-        addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode);
+        addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode, ToEnqueue);
       }
     }

     Record->MAllocaCommands.push_back(AllocaCmd);
-    Record->MWriteLeaves.push_back(AllocaCmd);
+    Record->MWriteLeaves.push_back(AllocaCmd, ToEnqueue);
     ++(AllocaCmd->MLeafCounter);
   }
   return AllocaCmd;
@@ -780,7 +815,8 @@ typename detail::enable_if_t<
     EmptyCommand *>
 Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector<T *> &Reqs,
                                      const QueueImplPtr &Queue,
-                                     Command::BlockReason Reason) {
+                                     Command::BlockReason Reason,
+                                     std::vector<Command *> &ToEnqueue) {
   EmptyCommand *EmptyCmd =
       new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue());

@@ -792,8 +828,9 @@ Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector<T *> &Reqs,
   EmptyCmd->MBlockReason = Reason;

   for (T *Req : Reqs) {
-    MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req);
-    AllocaCommandBase *AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue);
+    MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
+    AllocaCommandBase *AllocaCmd =
+        getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
     EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
   }

@@ -805,7 +842,7 @@ Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector<T *> &Reqs,
     MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
     updateLeaves({Cmd}, Record, Req->MAccessMode);
-    addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode);
+    addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue);
   }

   return EmptyCmd;
@@ -843,7 +880,8 @@ static void combineAccessModesOfReqs(std::vector<Requirement *> &Reqs) {
 Command *
 Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
-                               QueueImplPtr Queue) {
+                               QueueImplPtr Queue,
+                               std::vector<Command *> &ToEnqueue) {
   std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
   const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
   const CG::CGTYPE CGType = CommandGroup->getType();
@@ -872,10 +910,11 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
               ? static_cast<detail::CGHostTask &>(NewCmd->getCG()).MQueue
               : Queue;

-      Record = getOrInsertMemObjRecord(QueueForAlloca, Req);
+      Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
       markModifiedIfWrite(Record, Req);

-      AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca);
+      AllocaCmd =
+          getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue);

       isSameCtx =
           sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext);
@@ -888,7 +927,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
       // required access mode is valid, remap if not.
       if (Record->MCurContext->is_host() &&
           !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess))
-        remapMemoryObject(Record, Req, AllocaCmd);
+        remapMemoryObject(Record, Req, AllocaCmd, ToEnqueue);
     } else {
       // Cannot directly copy memory from OpenCL device to OpenCL device -
       // create two copies: device->host and host->device.
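The hunk that follows completes the comment above: a copy between two device
contexts is staged through the host. A sketch of that decision under
simplified, hypothetical types (the real logic also consults the command group
type and map/unmap state):

    #include <vector>

    struct Queue {};
    struct Command {};

    // Stand-in for GraphBuilder::insertMemoryMove().
    Command *insertMemoryMove(Queue *, std::vector<Command *> &) {
      return nullptr;
    }

    // A device-to-device transfer is split into device->host and
    // host->device moves when neither context is the host one.
    void moveToTarget(bool CurIsHost, bool TargetIsHost, Queue *HostQueue,
                      Queue *TargetQueue, std::vector<Command *> &ToEnqueue) {
      const bool NeedMemMoveToHost = !CurIsHost && !TargetIsHost;
      if (NeedMemMoveToHost)
        insertMemoryMove(HostQueue, ToEnqueue);   // device -> host
      insertMemoryMove(TargetQueue, ToEnqueue);   // host -> device
    }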
@@ -908,14 +947,16 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,

       if (NeedMemMoveToHost)
         insertMemoryMove(Record, Req,
-                         Scheduler::getInstance().getDefaultHostQueue());
-      insertMemoryMove(Record, Req, MemMoveTargetQueue);
+                         Scheduler::getInstance().getDefaultHostQueue(),
+                         ToEnqueue);
+      insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue);
     }
     std::set<Command *> Deps =
         findDepsForReq(Record, Req, Queue->getContextImplPtr());

     for (Command *Dep : Deps)
-      NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd});
+      if (Command *ConnCmd = NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd}))
+        ToEnqueue.push_back(ConnCmd);
   }

   // Set new command as user for dependencies and update leaves.
@@ -928,17 +969,19 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
     const Requirement *Req = Dep.MDepRequirement;
     MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
     updateLeaves({Dep.MDepCommand}, Record, Req->MAccessMode);
-    addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode);
+    addNodeToLeaves(Record, NewCmd.get(), Req->MAccessMode, ToEnqueue);
   }

   // Register all the events as dependencies
   for (detail::EventImplPtr e : Events) {
-    NewCmd->addDep(e);
+    if (Command *ConnCmd = NewCmd->addDep(e))
+      ToEnqueue.push_back(ConnCmd);
   }

   if (CGType == CG::CGTYPE::CODEPLAY_HOST_TASK)
-    NewCmd->MEmptyCmd = addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements,
-                                    Queue, Command::BlockReason::HostTask);
+    NewCmd->MEmptyCmd =
+        addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements, Queue,
+                    Command::BlockReason::HostTask, ToEnqueue);

   if (MPrintOptionsArray[AfterAddCG])
     printGraphAsDot("after_addCG");
@@ -1132,9 +1175,9 @@ void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
 // requirement in Dep we make ConnectCmd depend on DepEvent's command with this
 // requirement.
 // Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
-void Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,
-                                              EventImplPtr DepEvent,
-                                              const DepDesc &Dep) {
+Command *Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,
+                                                  EventImplPtr DepEvent,
+                                                  const DepDesc &Dep) {
   assert(Cmd->getWorkerContext() != DepEvent->getContextImpl());

   // construct Host Task type command manually and make it depend on DepEvent
@@ -1162,7 +1205,9 @@ void Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,

   if (Dep.MDepRequirement) {
     // make ConnectCmd depend on requirement
-    ConnectCmd->addDep(Dep);
+    // Dismiss the result here as it is not a connection command,
+    // because ConnectCmd is a host command
+    (void)ConnectCmd->addDep(Dep);
     assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
            Dep.MDepCommand);
     // add user to Dep.MDepCommand is already performed beyond this if branch
@@ -1170,12 +1215,16 @@ void Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,
     MemObjRecord *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);

     updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode);
-    addNodeToLeaves(Record, ConnectCmd, Dep.MDepRequirement->MAccessMode);
+    std::vector<Command *> ToEnqueue;
+    addNodeToLeaves(Record, ConnectCmd, Dep.MDepRequirement->MAccessMode,
+                    ToEnqueue);
+    assert(ToEnqueue.size() == 0);

     const std::vector<Requirement *> Reqs(1, Dep.MDepRequirement);
     EmptyCmd = addEmptyCmd(ConnectCmd, Reqs,
                            Scheduler::getInstance().getDefaultHostQueue(),
-                           Command::BlockReason::HostTask);
+                           Command::BlockReason::HostTask, ToEnqueue);
+    assert(ToEnqueue.size() == 0);
     // Dependencies for EmptyCmd are set in addEmptyCmd for provided Reqs.

     // Depend Cmd on empty command
@@ -1183,36 +1232,35 @@ void Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,
     {
       DepDesc CmdDep = Dep;
       CmdDep.MDepCommand = EmptyCmd;

-      Cmd->addDep(CmdDep);
+      // Dismiss the result here as it is not a connection command,
+      // because EmptyCmd is a host command
+      (void)Cmd->addDep(CmdDep);
     }
   } else {
+    std::vector<Command *> ToEnqueue;
     EmptyCmd = addEmptyCmd(
         ConnectCmd, {}, Scheduler::getInstance().getDefaultHostQueue(),
-        Command::BlockReason::HostTask);
+        Command::BlockReason::HostTask, ToEnqueue);
+    assert(ToEnqueue.size() == 0);

     // There is no requirement thus, empty command will only depend on
     // ConnectCmd via its event.
-    EmptyCmd->addDep(ConnectCmd->getEvent());
-    ConnectCmd->addDep(DepEvent);
+    // Dismiss the result here as it is not a connection command,
+    // because ConnectCmd is a host command
+    (void)EmptyCmd->addDep(ConnectCmd->getEvent());
+    (void)ConnectCmd->addDep(DepEvent);

     // Depend Cmd on empty command
-    Cmd->addDep(EmptyCmd->getEvent());
+    // Dismiss the result here as it is not a connection command,
+    // because EmptyCmd is a host command
+    (void)Cmd->addDep(EmptyCmd->getEvent());
   }

   EmptyCmd->addUser(Cmd);

   ConnectCmd->MEmptyCmd = EmptyCmd;

-  // FIXME graph builder shouldn't really enqueue commands. We're in the middle
-  // of enqueue process for some command Cmd. We're going to add a dependency
-  // for it. Need some nice and cute solution to enqueue ConnectCmd via
-  // standard scheduler/graph processor mechanisms.
-  // Though, we need this call to enqueue to launch ConnectCmd.
-  EnqueueResultT Res;
-  bool Enqueued = Scheduler::GraphProcessor::enqueueCommand(ConnectCmd, Res);
-  if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
-    throw runtime_error("Failed to enqueue a sync event between two contexts",
-                        PI_INVALID_OPERATION);
+  return ConnectCmd;
 }

 } // namespace detail
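connectDepEvent() now returns the connection command instead of enqueueing it
itself (the removed FIXME block). The command travels up the call chain until
a Scheduler entry point can enqueue it outside the write lock. A condensed
sketch of that flow, with hypothetical free functions standing in for the
member functions:

    #include <vector>

    struct Command {};

    // Hypothetical stand-ins for the member functions in this patch.
    Command *connectDepEvent() { return new Command{}; } // builds ConnectCmd
    Command *processDepEvent() { return connectDepEvent(); }
    Command *addDep() { return processDepEvent(); }

    // Scheduler entry point: the only place that enqueues the result, and it
    // does so outside the graph write lock.
    void schedulerAddCG(std::vector<Command *> &AuxiliaryCmds) {
      if (Command *ConnCmd = addDep())
        AuxiliaryCmds.push_back(ConnCmd); // enqueued later under a read lock
    }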
diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp
index 0e7a3fbbbaa0f..cc2e0cb15067c 100644
--- a/sycl/source/detail/scheduler/graph_processor.cpp
+++ b/sycl/source/detail/scheduler/graph_processor.cpp
@@ -36,7 +36,9 @@ Scheduler::GraphProcessor::getWaitList(EventImplPtr Event) {
   return Result;
 }

-void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event) {
+void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event,
+                                             ReadLockT &GraphReadLock,
+                                             bool LockTheLock) {
   Command *Cmd = getCommand(Event);
   // Command can be nullptr if user creates cl::sycl::event explicitly or the
   // event has been waited on by another thread
@@ -49,7 +51,13 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event) {
     // TODO: Reschedule commands.
     throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);

-  Cmd->getEvent()->waitInternal();
+  assert(Cmd->getEvent() == Event);
+
+  GraphReadLock.unlock();
+  Event->waitInternal();
+
+  if (LockTheLock)
+    GraphReadLock.lock();
 }

 bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd,
@@ -87,6 +95,19 @@ bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd,
       return false;
   }

+  // Only the graph read lock is to be held here.
+  // Enqueueing a command may take quite some time. Keeping the graph locked
+  // can starve other threads (e.g. when another thread attempts to acquire
+  // the write lock to add a command to the graph). Releasing the read lock
+  // without other safety measures isn't an option either, as another thread
+  // could enter the graph cleanup process (due to some event completing) and
+  // remove dependencies from the users of this command.
+  // An example: command A depends on commands B and C. This thread wants to
+  // enqueue A, and hence needs to enqueue B and C first. It walks A's
+  // dependency list and starts enqueueing B right away. Meanwhile, another
+  // thread waits on completion of C and starts the cleanup process while this
+  // thread is still in the middle of enqueueing B. The other thread then
+  // modifies the dependency list of A by removing C from it, and the
+  // iterators become invalid.
   return Cmd->enqueue(EnqueueResult, Blocking);
 }
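The new waitForEvent() contract above (release the read lock around the
blocking wait, optionally reacquire it) can be illustrated in isolation. A
minimal sketch, assuming only the standard library:

    #include <shared_mutex>

    // Release the graph read lock around a potentially long blocking wait and
    // reacquire it only if the caller still needs the graph afterwards.
    void waitOutsideLock(
        std::shared_lock<std::shared_timed_mutex> &GraphReadLock,
        bool LockTheLock = true) {
      GraphReadLock.unlock();
      // Event->waitInternal() would run here, without the lock held.
      if (LockTheLock)
        GraphReadLock.lock();
    }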
diff --git a/sycl/source/detail/scheduler/leaves_collection.cpp b/sycl/source/detail/scheduler/leaves_collection.cpp
index 051a7d61fe333..0ae0bcfbb9c0e 100644
--- a/sycl/source/detail/scheduler/leaves_collection.cpp
+++ b/sycl/source/detail/scheduler/leaves_collection.cpp
@@ -49,13 +49,14 @@ size_t LeavesCollection::remove(value_type Cmd) {
   return eraseHostAccessorCommand(static_cast<EmptyCommand *>(Cmd));
 }

-bool LeavesCollection::push_back(value_type Cmd) {
+bool LeavesCollection::push_back(value_type Cmd, EnqueueListT &ToEnqueue) {
   bool Result = false;

   if (isHostAccessorCmd(Cmd))
-    Result = addHostAccessorCommand(static_cast<EmptyCommand *>(Cmd));
+    Result =
+        addHostAccessorCommand(static_cast<EmptyCommand *>(Cmd), ToEnqueue);
   else
-    Result = addGenericCommand(Cmd);
+    Result = addGenericCommand(Cmd, ToEnqueue);

   return Result;
 }
@@ -72,7 +73,8 @@ std::vector<Command *> LeavesCollection::toVector() const {
   return Result;
 }

-bool LeavesCollection::addHostAccessorCommand(EmptyCommand *Cmd) {
+bool LeavesCollection::addHostAccessorCommand(EmptyCommand *Cmd,
+                                              EnqueueListT &ToEnqueue) {
   // 1. find the oldest command with doOverlap() = true amongst the List
   //    => OldCmd
   HostAccessorCommandSingleXRefT OldCmdIt;
@@ -97,7 +99,7 @@ bool LeavesCollection::addHostAccessorCommand(EmptyCommand *Cmd) {
   // when circular buffer is full.
   if (OldCmdIt != MHostAccessorCommands.end()) {
     // allocate dependency
-    MAllocateDependency(Cmd, *OldCmdIt, MRecord);
+    MAllocateDependency(Cmd, *OldCmdIt, MRecord, ToEnqueue);

     // erase the old cmd as it's tracked via dependency now
     eraseHostAccessorCommand(static_cast<EmptyCommand *>(*OldCmdIt));
@@ -109,7 +111,8 @@ bool LeavesCollection::addHostAccessorCommand(EmptyCommand *Cmd) {
   return true;
 }

-bool LeavesCollection::addGenericCommand(Command *Cmd) {
+bool LeavesCollection::addGenericCommand(Command *Cmd,
+                                         EnqueueListT &ToEnqueue) {
   if (MGenericCommands.full()) {
     Command *OldLeaf = MGenericCommands.front();

@@ -117,7 +120,7 @@ bool LeavesCollection::addGenericCommand(Command *Cmd) {
     if (OldLeaf == Cmd)
       return false;

-    MAllocateDependency(Cmd, OldLeaf, MRecord);
+    MAllocateDependency(Cmd, OldLeaf, MRecord, ToEnqueue);
   }

   MGenericCommands.push_back(Cmd);
diff --git a/sycl/source/detail/scheduler/leaves_collection.hpp b/sycl/source/detail/scheduler/leaves_collection.hpp
index 5ebadddbf9698..54b162693355a 100644
--- a/sycl/source/detail/scheduler/leaves_collection.hpp
+++ b/sycl/source/detail/scheduler/leaves_collection.hpp
@@ -39,10 +39,11 @@ class LeavesCollection {
 public:
   using GenericCommandsT = CircularBuffer<Command *>;
   using HostAccessorCommandsT = std::list<EmptyCommand *>;
+  using EnqueueListT = std::vector<Command *>;

   // Make first command depend on the second
   using AllocateDependencyF =
-      std::function<void(Command *, Command *, MemObjRecord *)>;
+      std::function<void(Command *, Command *, MemObjRecord *, EnqueueListT &)>;

   template <bool IsConst> class IteratorT;

@@ -81,7 +82,7 @@ class LeavesCollection {
   }

   /// Returns true if insertion took place. Returns false otherwise.
-  bool push_back(value_type Cmd);
+  bool push_back(value_type Cmd, EnqueueListT &ToEnqueue);

   /// Replacement for std::remove with subsequent call to erase(newEnd, end()).
   /// This function is introduced here due to complexity of iterator.
@@ -125,8 +126,8 @@ class LeavesCollection {

   AllocateDependencyF MAllocateDependency;

-  bool addGenericCommand(value_type Cmd);
-  bool addHostAccessorCommand(EmptyCommand *Cmd);
+  bool addGenericCommand(value_type Cmd, EnqueueListT &ToEnqueue);
+  bool addHostAccessorCommand(EmptyCommand *Cmd, EnqueueListT &ToEnqueue);

   // inserts a command to the end of list for its mem object
   void insertHostAccessorCommand(EmptyCommand *Cmd);
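The widened callback signature is the glue between LeavesCollection and the
graph builder: when an old leaf is evicted, the dependency-allocation callback
may now surface a connection command into the caller's enqueue list. A
self-contained sketch of wiring such a callback (simplified stand-in types):

    #include <functional>
    #include <vector>

    struct Command { int MLeafCounter = 0; };
    struct MemObjRecord {};
    using EnqueueListT = std::vector<Command *>;

    // The widened callback type from leaves_collection.hpp.
    using AllocateDependencyF =
        std::function<void(Command *, Command *, MemObjRecord *, EnqueueListT &)>;

    // Example callback: a mirror of the one built in
    // getOrInsertMemObjRecord(), minus the graph-specific bookkeeping.
    AllocateDependencyF MakeDependency =
        [](Command * /*Dependant*/, Command *Dependency,
           MemObjRecord * /*Record*/, EnqueueListT & /*ToEnqueue*/) {
          --Dependency->MLeafCounter; // evicted leaf is now tracked via an edge
        };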
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index 995b6e2a13ac8..1f646ff6acf5b 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -26,7 +26,8 @@ __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {

-void Scheduler::waitForRecordToFinish(MemObjRecord *Record) {
+void Scheduler::waitForRecordToFinish(MemObjRecord *Record,
+                                      ReadLockT &GraphReadLock) {
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   // Will contain the list of dependencies for the Release Command
   std::set<Command *> DepCommands;
@@ -40,7 +41,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) {
       // Capture the dependencies
       DepCommands.insert(Cmd);
 #endif
-      GraphProcessor::waitForEvent(Cmd->getEvent());
+      GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock);
     }
   for (Command *Cmd : Record->MWriteLeaves) {
     EnqueueResultT Res;
@@ -50,7 +51,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) {
 #ifdef XPTI_ENABLE_INSTRUMENTATION
     DepCommands.insert(Cmd);
 #endif
-    GraphProcessor::waitForEvent(Cmd->getEvent());
+    GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock);
   }
   for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) {
     Command *ReleaseCmd = AllocaCmd->getReleaseCmd();
@@ -63,7 +64,7 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) {
     // reported as edges
     ReleaseCmd->resolveReleaseDependencies(DepCommands);
 #endif
-    GraphProcessor::waitForEvent(ReleaseCmd->getEvent());
+    GraphProcessor::waitForEvent(ReleaseCmd->getEvent(), GraphReadLock);
   }
 }

@@ -71,6 +72,7 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
                               QueueImplPtr Queue) {
   EventImplPtr NewEvent = nullptr;
   const bool IsKernel = CommandGroup->getType() == CG::KERNEL;
+  std::vector<Command *> AuxiliaryCmds;
   const bool IsHostKernel = CommandGroup->getType() == CG::RUN_ON_HOST_INTEL;
   vector_class<StreamImplPtr> Streams;

@@ -87,31 +89,36 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
   }

   {
-    std::unique_lock<std::shared_timed_mutex> Lock(MGraphLock, std::defer_lock);
-    lockSharedTimedMutex(Lock);
+    WriteLockT Lock(MGraphLock, std::defer_lock);
+    acquireWriteLock(Lock);

     Command *NewCmd = nullptr;
     switch (CommandGroup->getType()) {
     case CG::UPDATE_HOST:
       NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup),
-                                             DefaultHostQueue);
+                                             DefaultHostQueue, AuxiliaryCmds);
       break;
     case CG::CODEPLAY_HOST_TASK:
-      NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), DefaultHostQueue);
+      NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), DefaultHostQueue,
+                                   AuxiliaryCmds);
       break;
     default:
-      NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue));
+      NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue),
+                                   AuxiliaryCmds);
     }
     NewEvent = NewCmd->getEvent();
   }

   {
-    std::shared_lock<std::shared_timed_mutex> Lock(MGraphLock);
+    ReadLockT Lock(MGraphLock);

     Command *NewCmd = static_cast<Command *>(NewEvent->getCommand());

+    EnqueueResultT Res;
+    bool Enqueued;
+
     auto CleanUp = [&]() {
-      if (NewCmd->MDeps.size() == 0 && NewCmd->MUsers.size() == 0) {
+      if (NewCmd && (NewCmd->MDeps.size() == 0 && NewCmd->MUsers.size() == 0)) {
         if (IsHostKernel)
           static_cast<ExecCGCommand *>(NewCmd)->releaseCG();

@@ -120,6 +127,20 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
       }
     };

+    for (Command *Cmd : AuxiliaryCmds) {
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      try {
+        if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
+          throw runtime_error("Auxiliary enqueue process failed.",
+                              PI_INVALID_OPERATION);
+      } catch (...) {
+        // enqueueCommand() and the if statement above may throw an exception,
+        // so release the owned resources to avoid a memory leak
+        CleanUp();
+        std::rethrow_exception(std::current_exception());
+      }
+    }
+
     if (NewCmd) {
       // TODO: Check if lazy mode.
       EnqueueResultT Res;
@@ -150,17 +171,30 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
 }

 EventImplPtr Scheduler::addCopyBack(Requirement *Req) {
-  std::unique_lock<std::shared_timed_mutex> Lock(MGraphLock, std::defer_lock);
-  lockSharedTimedMutex(Lock);
-  Command *NewCmd = MGraphBuilder.addCopyBack(Req);
-  // Command was not creted because there were no operations with
-  // buffer.
-  if (!NewCmd)
-    return nullptr;
+  std::vector<Command *> AuxiliaryCmds;
+  Command *NewCmd = nullptr;
+  {
+    WriteLockT Lock(MGraphLock, std::defer_lock);
+    acquireWriteLock(Lock);
+    NewCmd = MGraphBuilder.addCopyBack(Req, AuxiliaryCmds);
+    // Command was not created because there were no operations with
+    // the buffer.
+    if (!NewCmd)
+      return nullptr;
+  }

   try {
+    ReadLockT Lock(MGraphLock);
     EnqueueResultT Res;
-    bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
+    bool Enqueued;
+
+    for (Command *Cmd : AuxiliaryCmds) {
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
+        throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
+    }
+
+    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
   } catch (...) {
@@ -174,13 +208,15 @@ Scheduler &Scheduler::getInstance() {
 }

 std::vector<EventImplPtr> Scheduler::getWaitList(EventImplPtr Event) {
-  std::shared_lock<std::shared_timed_mutex> Lock(MGraphLock);
+  ReadLockT Lock(MGraphLock);
   return GraphProcessor::getWaitList(std::move(Event));
 }

 void Scheduler::waitForEvent(EventImplPtr Event) {
-  std::shared_lock<std::shared_timed_mutex> Lock(MGraphLock);
-  GraphProcessor::waitForEvent(std::move(Event));
+  ReadLockT Lock(MGraphLock);
+  // It's fine to leave the lock unlocked upon return from waitForEvent as
+  // there are no more actions to perform on the graph here
+  GraphProcessor::waitForEvent(std::move(Event), Lock, /*LockTheLock=*/false);
 }

 static void deallocateStreams(
@@ -203,8 +239,7 @@ void Scheduler::cleanupFinishedCommands(EventImplPtr FinishedEvent) {
   // Avoiding deadlock situation, where one thread is in the process of
   // enqueueing (with a locked mutex) a currently blocked task that waits for
   // another thread which is stuck at attempting cleanup.
-  std::unique_lock<std::shared_timed_mutex> Lock(MGraphLock,
-                                                 std::try_to_lock);
+  WriteLockT Lock(MGraphLock, std::try_to_lock);
   if (Lock.owns_lock()) {
     auto FinishedCmd = static_cast<Command *>(FinishedEvent->getCommand());
     // The command might have been cleaned up (and set to nullptr) by another
@@ -222,12 +257,13 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
   // objects, this is needed to guarantee that streamed data is printed and
   // resources are released.
   std::vector<std::shared_ptr<stream_impl>> StreamsToDeallocate;
+
   {
     MemObjRecord *Record = nullptr;
-    std::unique_lock<std::shared_timed_mutex> Lock(MGraphLock, std::defer_lock);
+    WriteLockT Lock(MGraphLock, std::defer_lock);

     {
-      lockSharedTimedMutex(Lock);
+      acquireWriteLock(Lock);

       Record = MGraphBuilder.getMemObjRecord(MemObj);
       if (!Record)
@@ -240,12 +276,12 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
     {
       // This only needs a shared mutex as it only involves enqueueing and
       // awaiting for events
-      std::shared_lock<std::shared_timed_mutex> Lock(MGraphLock);
-      waitForRecordToFinish(Record);
+      ReadLockT Lock(MGraphLock);
+      waitForRecordToFinish(Record, Lock);
     }

     {
-      lockSharedTimedMutex(Lock);
+      acquireWriteLock(Lock);
       MGraphBuilder.decrementLeafCountersForRecord(Record);
       MGraphBuilder.cleanupCommandsForRecord(Record, StreamsToDeallocate);
       MGraphBuilder.removeRecordForMemObj(MemObj);
@@ -255,24 +291,42 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
 }

 EventImplPtr Scheduler::addHostAccessor(Requirement *Req) {
-  std::unique_lock<std::shared_timed_mutex> Lock(MGraphLock, std::defer_lock);
-  lockSharedTimedMutex(Lock);
+  std::vector<Command *> AuxiliaryCmds;
+  Command *NewCmd = nullptr;
+
+  {
+    WriteLockT Lock(MGraphLock, std::defer_lock);
+    acquireWriteLock(Lock);

-  Command *NewCmd = MGraphBuilder.addHostAccessor(Req);
+    NewCmd = MGraphBuilder.addHostAccessor(Req, AuxiliaryCmds);
+  }

   if (!NewCmd)
     return nullptr;
-  EnqueueResultT Res;
-  bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
-  if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
-    throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
+
+  {
+    ReadLockT ReadLock(MGraphLock);
+    EnqueueResultT Res;
+    bool Enqueued;
+
+    for (Command *Cmd : AuxiliaryCmds) {
+      Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
+      if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
+        throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
+    }
+
+    Enqueued = GraphProcessor::enqueueCommand(NewCmd, Res);
+    if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
+      throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
+  }
+
   return NewCmd->getEvent();
 }

 void Scheduler::releaseHostAccessor(Requirement *Req) {
   Command *const BlockedCmd = Req->MBlockedCmd;

-  std::shared_lock<std::shared_timed_mutex> Lock(MGraphLock);
+  ReadLockT Lock(MGraphLock);

   assert(BlockedCmd && "Can't find appropriate command to unblock");

@@ -292,6 +346,7 @@ void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req) {
         throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
     }
   };
+
   EnqueueLeaves(Record->MReadLeaves);
   EnqueueLeaves(Record->MWriteLeaves);
 }
@@ -336,8 +391,7 @@ Scheduler::~Scheduler() {
   }
 }

-void Scheduler::lockSharedTimedMutex(
-    std::unique_lock<std::shared_timed_mutex> &Lock) {
+void Scheduler::acquireWriteLock(WriteLockT &Lock) {
 #ifdef _WIN32
   // Avoiding deadlock situation for MSVC. std::shared_timed_mutex specification
   // does not specify a priority for shared and exclusive accesses. It will be a
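scheduler.cpp now follows a consistent two-phase shape in addCG(),
addCopyBack() and addHostAccessor(): mutate the graph under the write lock,
collect auxiliary commands, then enqueue them plus the new command under a
read lock only. A minimal sketch of that shape, assuming only the standard
library:

    #include <mutex>
    #include <shared_mutex>
    #include <vector>

    struct Command {};
    static std::shared_timed_mutex MGraphLock;

    // Two-phase shape: graph mutation under the write lock, enqueueing of
    // the collected auxiliary commands under a read lock only.
    void submitSketch() {
      std::vector<Command *> AuxiliaryCmds;
      {
        std::unique_lock<std::shared_timed_mutex> Lock(MGraphLock);
        // MGraphBuilder.addCG(..., AuxiliaryCmds) would populate the list.
      }
      {
        std::shared_lock<std::shared_timed_mutex> Lock(MGraphLock);
        for (Command *Cmd : AuxiliaryCmds)
          (void)Cmd; // GraphProcessor::enqueueCommand(Cmd, Res) runs here.
      }
    }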
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 917fc0e1a3ee3..5193d61858849 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -450,12 +450,17 @@ class Scheduler {
   ~Scheduler();

 protected:
+  // TODO: after switching to C++17, change std::shared_timed_mutex to
+  // std::shared_mutex
+  using RWLockT = std::shared_timed_mutex;
+  using ReadLockT = std::shared_lock<RWLockT>;
+  using WriteLockT = std::unique_lock<RWLockT>;
+
   /// Provides exclusive access to std::shared_timed_mutex object with deadlock
   /// avoidance
   ///
-  /// \param Lock is an instance of std::unique_lock<std::shared_timed_mutex>
-  /// class
-  void lockSharedTimedMutex(std::unique_lock<std::shared_timed_mutex> &Lock);
+  /// \param Lock is an instance of WriteLockT, created with \c std::defer_lock
+  void acquireWriteLock(WriteLockT &Lock);

   static void enqueueLeavesOfReqUnlocked(const Requirement *const Req);

@@ -474,25 +479,27 @@ class Scheduler {
     /// \sa queue::submit, Scheduler::addCG
     ///
     /// \return a command that represents command group execution.
-    Command *addCG(std::unique_ptr<detail::CG> CommandGroup,
-                   QueueImplPtr Queue);
+    Command *addCG(std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue,
+                   std::vector<Command *> &ToEnqueue);

     /// Registers a \ref CG "command group" that updates host memory to the
     /// latest state.
     ///
     /// \return a command that represents command group execution.
     Command *addCGUpdateHost(std::unique_ptr<detail::CG> CommandGroup,
-                             QueueImplPtr HostQueue);
+                             QueueImplPtr HostQueue,
+                             std::vector<Command *> &ToEnqueue);

     /// Enqueues a command to update memory to the latest state.
     ///
     /// \param Req is a requirement, that describes memory object.
-    Command *addCopyBack(Requirement *Req);
+    Command *addCopyBack(Requirement *Req, std::vector<Command *> &ToEnqueue);

     /// Enqueues a command to create a host accessor.
     ///
     /// \param Req points to memory being accessed.
-    Command *addHostAccessor(Requirement *Req);
+    Command *addHostAccessor(Requirement *Req,
+                             std::vector<Command *> &ToEnqueue);

     /// [Provisional] Optimizes the whole graph.
     void optimize();
@@ -522,7 +529,8 @@ class Scheduler {
     /// \return a pointer to MemObjRecord for pointer to memory object. If the
     /// record is not found, nullptr is returned.
     MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
-                                          const Requirement *Req);
+                                          const Requirement *Req,
+                                          std::vector<Command *> &ToEnqueue);

     /// Decrements leaf counters for all leaves of the record.
     void decrementLeafCountersForRecord(MemObjRecord *Record);
@@ -537,7 +545,8 @@ class Scheduler {

     /// Adds new command to leaves if needed.
     void addNodeToLeaves(MemObjRecord *Record, Command *Cmd,
-                         access::mode AccessMode);
+                         access::mode AccessMode,
+                         std::vector<Command *> &ToEnqueue);

     /// Removes commands from leaves.
     void updateLeaves(const std::set<Command *> &Cmds, MemObjRecord *Record,
@@ -547,10 +556,11 @@ class Scheduler {
     /// \param Cmd dependant command
     /// \param DepEvent event to depend on
     /// \param Dep optional DepDesc to perform connection properly
+    /// \returns the connecting command which is to be enqueued
     ///
     /// Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
-    void connectDepEvent(Command *const Cmd, EventImplPtr DepEvent,
-                         const DepDesc &Dep);
+    Command *connectDepEvent(Command *const Cmd, EventImplPtr DepEvent,
+                             const DepDesc &Dep);

     std::vector<SYCLMemObjI *> MMemObjs;

@@ -565,16 +575,19 @@ class Scheduler {
     /// \param Req is a Requirement describing destination.
     /// \param Queue is a queue that is bound to target context.
     Command *insertMemoryMove(MemObjRecord *Record, Requirement *Req,
-                              const QueueImplPtr &Queue);
+                              const QueueImplPtr &Queue,
+                              std::vector<Command *> &ToEnqueue);

     // Inserts commands required to remap the memory object to its current host
     // context so that the required access mode becomes valid.
     Command *remapMemoryObject(MemObjRecord *Record, Requirement *Req,
-                               AllocaCommandBase *HostAllocaCmd);
+                               AllocaCommandBase *HostAllocaCmd,
+                               std::vector<Command *> &ToEnqueue);

     UpdateHostRequirementCommand *
     insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req,
-                           const QueueImplPtr &Queue);
+                           const QueueImplPtr &Queue,
+                           std::vector<Command *> &ToEnqueue);

     /// Finds dependencies for the requirement.
     std::set<Command *> findDepsForReq(MemObjRecord *Record,
@@ -586,7 +599,8 @@ class Scheduler {
         std::is_same<typename std::remove_cv_t<T>, Requirement>::value,
         EmptyCommand *>
     addEmptyCmd(Command *Cmd, const std::vector<T *> &Req,
-                const QueueImplPtr &Queue, Command::BlockReason Reason);
+                const QueueImplPtr &Queue, Command::BlockReason Reason,
+                std::vector<Command *> &ToEnqueue);

   protected:
     /// Finds a command dependency corresponding to the record.
@@ -605,9 +619,10 @@ class Scheduler {
     /// Searches for suitable alloca in memory record.
     ///
     /// If none found, creates new one.
-    AllocaCommandBase *getOrCreateAllocaForReq(MemObjRecord *Record,
-                                               const Requirement *Req,
-                                               QueueImplPtr Queue);
+    AllocaCommandBase *
+    getOrCreateAllocaForReq(MemObjRecord *Record, const Requirement *Req,
+                            QueueImplPtr Queue,
+                            std::vector<Command *> &ToEnqueue);

     void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req);

@@ -711,22 +726,38 @@ class Scheduler {
     static std::vector<EventImplPtr> getWaitList(EventImplPtr Event);

     /// Waits for the command, associated with Event passed, is completed.
-    static void waitForEvent(EventImplPtr Event);
+    /// \param GraphReadLock read-lock which is already acquired for reading
+    /// \param LockTheLock selects if graph lock should be locked upon return
+    ///
+    /// The function may unlock and lock GraphReadLock as needed. Upon return
+    /// the lock is left in locked state if and only if LockTheLock is true.
+    static void waitForEvent(EventImplPtr Event, ReadLockT &GraphReadLock,
+                             bool LockTheLock = true);

     /// Enqueues the command and all its dependencies.
     ///
     /// \param EnqueueResult is set to specific status if enqueue failed.
+    /// \param GraphReadLock read-lock which is already acquired for reading
     /// \return true if the command is successfully enqueued.
+    ///
+    /// The function may unlock and lock GraphReadLock as needed. Upon return
+    /// the lock is left in locked state.
     static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult,
                                BlockingT Blocking = NON_BLOCKING);
   };

-  void waitForRecordToFinish(MemObjRecord *Record);
+  /// This function waits on all of the graph leaves which somehow use the
+  /// memory object which is represented by \c Record. The function is called
+  /// upon destruction of the memory buffer.
+  /// \param Record the memory record whose graph leaves are awaited
+  /// \param GraphReadLock locked graph read lock
+  ///
+  /// GraphReadLock will be unlocked/locked as needed. Upon return from the
+  /// function, GraphReadLock will be left in locked state.
+  void waitForRecordToFinish(MemObjRecord *Record, ReadLockT &GraphReadLock);

   GraphBuilder MGraphBuilder;

-  // TODO: after switching to C++17, change std::shared_timed_mutex to
-  // std::shared_mutex
-  std::shared_timed_mutex MGraphLock;
+  RWLockT MGraphLock;

   QueueImplPtr DefaultHostQueue;
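For reference, the lock aliases the header introduces boil down to the
following; per the TODO, RWLockT could become std::shared_mutex once C++17 is
available:

    #include <mutex>
    #include <shared_mutex>

    using RWLockT = std::shared_timed_mutex;
    using ReadLockT = std::shared_lock<RWLockT>;   // concurrent graph readers
    using WriteLockT = std::unique_lock<RWLockT>;  // exclusive graph mutation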
EXPECT_CALL(D, enqueue(_, _)).Times(0); + MockScheduler MS; + auto Lock = MS.acquireGraphReadLock(); detail::EnqueueResultT Res; bool Enqueued = MockScheduler::enqueueCommand(&A, Res, detail::NON_BLOCKING); ASSERT_FALSE(Enqueued) << "Blocked command should not be enqueued\n"; @@ -112,6 +116,8 @@ TEST_F(SchedulerTest, EnqueueBlockedCommandEarlyExit) { EXPECT_CALL(A, enqueue(_, _)).Times(0); EXPECT_CALL(B, enqueue(_, _)).Times(0); + MockScheduler MS; + auto Lock = MS.acquireGraphReadLock(); detail::EnqueueResultT Res; bool Enqueued = MockScheduler::enqueueCommand(&A, Res, detail::NON_BLOCKING); ASSERT_FALSE(Enqueued) << "Blocked command should not be enqueued\n"; @@ -148,7 +154,7 @@ TEST_F(SchedulerTest, EnqueueHostDependency) { new cl::sycl::detail::event_impl(detail::getSyclObjImpl(MQueue))}; DepEvent->setCommand(&B); - A.addDep(DepEvent); + (void)A.addDep(DepEvent); // We have such a "graph": // @@ -163,6 +169,8 @@ TEST_F(SchedulerTest, EnqueueHostDependency) { EXPECT_CALL(A, enqueue(_, _)).Times(1); EXPECT_CALL(B, enqueue(_, _)).Times(1); + MockScheduler MS; + auto Lock = MS.acquireGraphReadLock(); detail::EnqueueResultT Res; bool Enqueued = MockScheduler::enqueueCommand(&A, Res, detail::NON_BLOCKING); ASSERT_TRUE(Enqueued) << "The command should be enqueued\n"; diff --git a/sycl/unittests/scheduler/FailedCommands.cpp b/sycl/unittests/scheduler/FailedCommands.cpp index 02c9f9ba1d865..37a7a71a4afdc 100644 --- a/sycl/unittests/scheduler/FailedCommands.cpp +++ b/sycl/unittests/scheduler/FailedCommands.cpp @@ -16,10 +16,12 @@ TEST_F(SchedulerTest, FailedDependency) { MockCommand MDep(detail::getSyclObjImpl(MQueue)); MockCommand MUser(detail::getSyclObjImpl(MQueue)); MDep.addUser(&MUser); - MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr}); + (void)MUser.addDep(detail::DepDesc{&MDep, &MockReq, nullptr}); MUser.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueReady; MDep.MEnqueueStatus = detail::EnqueueResultT::SyclEnqueueFailed; + MockScheduler MS; + auto Lock = MS.acquireGraphReadLock(); detail::EnqueueResultT Res; bool Enqueued = MockScheduler::enqueueCommand(&MUser, Res, detail::NON_BLOCKING); diff --git a/sycl/unittests/scheduler/FinishedCmdCleanup.cpp b/sycl/unittests/scheduler/FinishedCmdCleanup.cpp index abafde6cd2c5a..34401108187e1 100644 --- a/sycl/unittests/scheduler/FinishedCmdCleanup.cpp +++ b/sycl/unittests/scheduler/FinishedCmdCleanup.cpp @@ -21,8 +21,9 @@ TEST_F(SchedulerTest, FinishedCmdCleanup) { detail::Requirement MockReqA = getMockRequirement(BufA); detail::Requirement MockReqB = getMockRequirement(BufB); detail::Requirement MockReqC = getMockRequirement(BufC); - detail::MemObjRecord *RecC = - MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(MQueue), &MockReqC); + std::vector AuxCmds; + detail::MemObjRecord *RecC = MS.getOrInsertMemObjRecord( + detail::getSyclObjImpl(MQueue), &MockReqC, AuxCmds); // Create a graph and check that all inner nodes have been deleted and // their users have had the corresponding dependency replaced with a @@ -59,13 +60,15 @@ TEST_F(SchedulerTest, FinishedCmdCleanup) { detail::getSyclObjImpl(MQueue), MockReqA, Callback); addEdge(InnerC, &AllocaA, &AllocaA); + std::vector ToEnqueue; + MockCommand LeafB{detail::getSyclObjImpl(MQueue), MockReqB}; addEdge(&LeafB, &AllocaB, &AllocaB); - MS.addNodeToLeaves(RecC, &LeafB); + MS.addNodeToLeaves(RecC, &LeafB, access::mode::read, ToEnqueue); MockCommand LeafA{detail::getSyclObjImpl(MQueue), MockReqA}; addEdge(&LeafA, InnerC, &AllocaA); - MS.addNodeToLeaves(RecC, &LeafA); + 
MS.addNodeToLeaves(RecC, &LeafA, access::mode::read, ToEnqueue); MockCommand *InnerB = new MockCommandWithCallback( detail::getSyclObjImpl(MQueue), MockReqB, Callback); diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp index cec7c1772852c..14c8a802d1772 100644 --- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp @@ -112,18 +112,21 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { buffer Buf(&val, range<1>(1)); detail::Requirement Req = getMockRequirement(Buf); + std::vector AuxCmds; detail::MemObjRecord *Record = - MS.getOrInsertMemObjRecord(InOrderQueueImpl, &Req); - MS.getOrCreateAllocaForReq(Record, &Req, InOrderQueueImpl); - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue); + MS.getOrInsertMemObjRecord(InOrderQueueImpl, &Req, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, InOrderQueueImpl, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); // Check that sequential memory movements submitted to the same in-order // queue do not depend on each other. - detail::Command *Cmd = MS.insertMemoryMove(Record, &Req, DefaultHostQueue); + detail::Command *Cmd = + MS.insertMemoryMove(Record, &Req, DefaultHostQueue, AuxCmds); detail::EnqueueResultT Res; + auto ReadLock = MS.acquireGraphReadLock(); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); - Cmd = MS.insertMemoryMove(Record, &Req, InOrderQueueImpl); + Cmd = MS.insertMemoryMove(Record, &Req, InOrderQueueImpl, AuxCmds); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); - Cmd = MS.insertMemoryMove(Record, &Req, DefaultHostQueue); + Cmd = MS.insertMemoryMove(Record, &Req, DefaultHostQueue, AuxCmds); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); } diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp index d840099a0048d..ffed74ba0e1ec 100644 --- a/sycl/unittests/scheduler/LeafLimit.cpp +++ b/sycl/unittests/scheduler/LeafLimit.cpp @@ -29,8 +29,9 @@ TEST_F(SchedulerTest, LeafLimit) { MockDepCmd = std::make_unique(detail::getSyclObjImpl(MQueue), MockReq); - detail::MemObjRecord *Rec = - MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(MQueue), &MockReq); + std::vector AuxCmds; + detail::MemObjRecord *Rec = MS.getOrInsertMemObjRecord( + detail::getSyclObjImpl(MQueue), &MockReq, AuxCmds); // Create commands that will be added as leaves exceeding the limit by 1 for (std::size_t i = 0; i < Rec->MWriteLeaves.genericCommandsCapacity() + 1; @@ -41,12 +42,13 @@ TEST_F(SchedulerTest, LeafLimit) { // Create edges: all soon-to-be leaves are direct users of MockDep for (auto &Leaf : LeavesToAdd) { MockDepCmd->addUser(Leaf.get()); - Leaf->addDep( + (void)Leaf->addDep( detail::DepDesc{MockDepCmd.get(), Leaf->getRequirement(), nullptr}); } + std::vector ToEnqueue; // Add edges as leaves and exceed the leaf limit for (auto &LeafPtr : LeavesToAdd) { - MS.addNodeToLeaves(Rec, LeafPtr.get()); + MS.addNodeToLeaves(Rec, LeafPtr.get(), access::mode::write, ToEnqueue); } // Check that the oldest leaf has been removed from the leaf list // and added as a dependency of the newest one instead diff --git a/sycl/unittests/scheduler/LeavesCollection.cpp b/sycl/unittests/scheduler/LeavesCollection.cpp index ee5a3952cb0c6..a731f960c0c4a 100644 --- a/sycl/unittests/scheduler/LeavesCollection.cpp +++ b/sycl/unittests/scheduler/LeavesCollection.cpp @@ -51,8 +51,13 @@ TEST_F(LeavesCollectionTest, PushBack) { size_t TimesGenericWasFull; + std::vector 
diff --git a/sycl/unittests/scheduler/LeavesCollection.cpp b/sycl/unittests/scheduler/LeavesCollection.cpp
index ee5a3952cb0c6..a731f960c0c4a 100644
--- a/sycl/unittests/scheduler/LeavesCollection.cpp
+++ b/sycl/unittests/scheduler/LeavesCollection.cpp
@@ -51,8 +51,13 @@ TEST_F(LeavesCollectionTest, PushBack) {
   size_t TimesGenericWasFull;
 
+  std::vector<Command *> ToEnqueue;
+
   LeavesCollection::AllocateDependencyF AllocateDependency =
-      [&](Command *, Command *, MemObjRecord *) { ++TimesGenericWasFull; };
+      [&](Command *, Command *, MemObjRecord *,
+          std::vector<Command *> &) {
+        ++TimesGenericWasFull;
+      };
 
   // add only generic commands
   {
@@ -65,7 +70,7 @@ TEST_F(LeavesCollectionTest, PushBack) {
     for (size_t Idx = 0; Idx < GenericCmdsCapacity * 2; ++Idx) {
       Cmds.push_back(createGenericCommand(getSyclObjImpl(MQueue)));
 
-      LE.push_back(Cmds.back().get());
+      LE.push_back(Cmds.back().get(), ToEnqueue);
     }
 
     ASSERT_EQ(TimesGenericWasFull, GenericCmdsCapacity)
@@ -95,7 +100,7 @@ TEST_F(LeavesCollectionTest, PushBack) {
                      : createEmptyCommand(getSyclObjImpl(MQueue), MockReq);
       Cmds.push_back(Cmd);
 
-      LE.push_back(Cmds.back().get());
+      LE.push_back(Cmds.back().get(), ToEnqueue);
     }
 
     ASSERT_EQ(TimesGenericWasFull, GenericCmdsCapacity)
@@ -112,8 +117,11 @@ TEST_F(LeavesCollectionTest, Remove) {
   static constexpr size_t GenericCmdsCapacity = 8;
 
+  std::vector<Command *> ToEnqueue;
+
   LeavesCollection::AllocateDependencyF AllocateDependency =
-      [](Command *, Command *Old, MemObjRecord *) { --Old->MLeafCounter; };
+      [](Command *, Command *Old, MemObjRecord *,
+         std::vector<Command *> &) { --Old->MLeafCounter; };
 
   {
     cl::sycl::buffer<int, 1> Buf(cl::sycl::range<1>(1));
@@ -129,7 +137,7 @@ TEST_F(LeavesCollectionTest, Remove) {
                      : createEmptyCommand(getSyclObjImpl(MQueue), MockReq);
       Cmds.push_back(Cmd);
 
-      if (LE.push_back(Cmds.back().get()))
+      if (LE.push_back(Cmds.back().get(), ToEnqueue))
        ++Cmd->MLeafCounter;
     }
diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
index 6056c94b445b2..aa10bb446c32c 100644
--- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
+++ b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp
@@ -68,7 +68,8 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) {
                                      /*PropList=*/{}));
 
   auto AllocaDep = [](cl::sycl::detail::Command *, cl::sycl::detail::Command *,
-                      cl::sycl::detail::MemObjRecord *) {};
+                      cl::sycl::detail::MemObjRecord *,
+                      std::vector<cl::sycl::detail::Command *> &) {};
 
   std::shared_ptr<cl::sycl::detail::MemObjRecord> Record{
       new cl::sycl::detail::MemObjRecord(DefaultHostQueue->getContextImplPtr(),
@@ -84,11 +85,12 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) {
   MockCommand DepDepCmd(DefaultHostQueue, Req);
   DepCmd.MDeps.push_back({&DepDepCmd, DepDepCmd.getRequirement(), &AllocaCmd1});
   DepDepCmd.MUsers.insert(&DepCmd);
-  Record->MWriteLeaves.push_back(&DepCmd);
+  std::vector<cl::sycl::detail::Command *> ToEnqueue;
+  Record->MWriteLeaves.push_back(&DepCmd, ToEnqueue);
 
   MockScheduler MS;
   cl::sycl::detail::Command *AllocaCmd2 =
-      MS.getOrCreateAllocaForReq(Record.get(), &Req, Q1);
+      MS.getOrCreateAllocaForReq(Record.get(), &Req, Q1, ToEnqueue);
 
   ASSERT_TRUE(!!AllocaCmd1.MLinkedAllocaCmd)
       << "No link appeared in existing command";
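Both callbacks above gain a trailing std::vector of command pointers to match the widened AllocateDependencyF signature. The sketch below mirrors that shape with a plain std::function; Command, MemObjRecord, and the alias names are forward-declared stand-ins, not the real headers:

```cpp
#include <functional>
#include <vector>

struct Command;      // declaration only; pointer parameters need no definition
struct MemObjRecord;
using CommandPtrVec = std::vector<Command *>;

// Hypothetical mirror of the post-patch callback type: the fourth parameter
// is the ToEnqueue vector the leaf collection threads through.
using AllocateDependencyF =
    std::function<void(Command *, Command *, MemObjRecord *, CommandPtrVec &)>;

int main() {
  // A test that does not care about auxiliary commands simply ignores the
  // new parameter, as the updated lambdas in these files do.
  AllocateDependencyF Ignore =
      [](Command *, Command *, MemObjRecord *, CommandPtrVec &) {};

  // A caller that wants the extra commands can collect them instead.
  AllocateDependencyF Collect =
      [](Command *, Command *Old, MemObjRecord *, CommandPtrVec &ToEnqueue) {
        ToEnqueue.push_back(Old); // illustrative only
      };
  (void)Ignore;
  (void)Collect;
}
```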
diff --git a/sycl/unittests/scheduler/MemObjCommandCleanup.cpp b/sycl/unittests/scheduler/MemObjCommandCleanup.cpp
index d35ece2454203..17429831f4257 100644
--- a/sycl/unittests/scheduler/MemObjCommandCleanup.cpp
+++ b/sycl/unittests/scheduler/MemObjCommandCleanup.cpp
@@ -17,8 +17,9 @@ TEST_F(SchedulerTest, MemObjCommandCleanup) {
   buffer<int, 1> BufB(range<1>(1));
   detail::Requirement MockReqA = getMockRequirement(BufA);
   detail::Requirement MockReqB = getMockRequirement(BufB);
-  detail::MemObjRecord *RecA =
-      MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(MQueue), &MockReqA);
+  std::vector<detail::Command *> AuxCmds;
+  detail::MemObjRecord *RecA = MS.getOrInsertMemObjRecord(
+      detail::getSyclObjImpl(MQueue), &MockReqA, AuxCmds);
 
   // Create 2 fake allocas, one of which will be cleaned up
   detail::AllocaCommand *MockAllocaA =
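The recurring AuxCmds idiom in these tests — every graph-builder call appends whatever auxiliary commands it had to create to a caller-owned vector, which the caller drains afterwards — reduces to a simple out-parameter accumulation pattern. A self-contained sketch under that assumption; all names here are hypothetical:

```cpp
#include <vector>

struct Cmd {};

// Pretend graph-builder call: returns its primary result and appends any
// auxiliary command it created as a side effect to the caller's vector.
Cmd *getOrInsertRecord(std::vector<Cmd *> &Aux) {
  static Cmd Alloc; // e.g. an allocation created while inserting the record
  Aux.push_back(&Alloc);
  static Cmd Rec;
  return &Rec;
}

void enqueue(Cmd *) { /* submit to the device */ }

int main() {
  std::vector<Cmd *> AuxCmds;
  Cmd *Record = getOrInsertRecord(AuxCmds);
  (void)Record;
  for (Cmd *C : AuxCmds) // drain everything the builder created on the side
    enqueue(C);
}
```

Passing one vector through several calls, as the tests do, lets the caller batch all deferred enqueues into a single pass.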
diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
index 9dc561295eb86..ed66cc0c9f60f 100644
--- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
+++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
@@ -89,9 +89,11 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
     buffer<int, 1> Buf(&val, range<1>(1));
     detail::Requirement Req = getMockRequirement(Buf);
 
-    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
+    std::vector<detail::Command *> AuxCmds;
+    detail::MemObjRecord *Record =
+        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     detail::AllocaCommandBase *NonHostAllocaCmd =
-        MS.getOrCreateAllocaForReq(Record, &Req, QImpl);
+        MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
 
     // Both non-host and host allocations should be created in this case in
     // order to perform a memory move.
@@ -102,7 +104,8 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
     EXPECT_TRUE(!NonHostAllocaCmd->MLinkedAllocaCmd);
     EXPECT_TRUE(Record->MCurContext->is_host());
 
-    detail::Command *MemoryMove = MS.insertMemoryMove(Record, &Req, QImpl);
+    detail::Command *MemoryMove =
+        MS.insertMemoryMove(Record, &Req, QImpl, AuxCmds);
     EXPECT_EQ(MemoryMove->getType(), detail::Command::COPY_MEMORY);
   }
   // Check non-host alloca with discard access modes
@@ -116,8 +119,10 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
 
     // No need to create a host allocation in this case since the data can be
     // discarded.
-    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
-    MS.getOrCreateAllocaForReq(Record, &DiscardReq, QImpl);
+    std::vector<detail::Command *> AuxCmds;
+    detail::MemObjRecord *Record =
+        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
+    MS.getOrCreateAllocaForReq(Record, &DiscardReq, QImpl, AuxCmds);
     EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
   }
   // Check non-host alloca without user pointer
@@ -127,8 +132,10 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
 
     // No need to create a host allocation in this case since there's no data to
     // initialize the buffer with.
-    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
-    MS.getOrCreateAllocaForReq(Record, &Req, QImpl);
+    std::vector<detail::Command *> AuxCmds;
+    detail::MemObjRecord *Record =
+        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
+    MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
     EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
   }
   // Check host -> non-host alloca
@@ -139,18 +146,20 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
 
     // No special handling required: alloca commands are created one after
     // another and the transfer is done via a write operation.
+    std::vector<detail::Command *> AuxCmds;
     detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(DefaultHostQueue, &Req);
+        MS.getOrInsertMemObjRecord(DefaultHostQueue, &Req, AuxCmds);
     detail::AllocaCommandBase *HostAllocaCmd =
-        MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue);
+        MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds);
 
     EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
     detail::AllocaCommandBase *NonHostAllocaCmd =
-        MS.getOrCreateAllocaForReq(Record, &Req, QImpl);
+        MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
 
     EXPECT_EQ(Record->MAllocaCommands.size(), 2U);
     EXPECT_TRUE(!HostAllocaCmd->MLinkedAllocaCmd);
     EXPECT_TRUE(!NonHostAllocaCmd->MLinkedAllocaCmd);
-    detail::Command *MemoryMove = MS.insertMemoryMove(Record, &Req, QImpl);
+    detail::Command *MemoryMove =
+        MS.insertMemoryMove(Record, &Req, QImpl, AuxCmds);
     EXPECT_EQ(MemoryMove->getType(), detail::Command::COPY_MEMORY);
   }
   // Check that memory movement operations work correctly with/after discard
@@ -163,13 +172,15 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
     detail::Requirement DiscardReq = getMockRequirement(Buf);
     DiscardReq.MAccessMode = access::mode::discard_read_write;
 
-    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
-    MS.getOrCreateAllocaForReq(Record, &Req, QImpl);
-    MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue);
+    std::vector<detail::Command *> AuxCmds;
+    detail::MemObjRecord *Record =
+        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
+    MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
+    MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds);
 
     // Memory movement operations should be omitted for discard access modes.
     detail::Command *MemoryMove =
-        MS.insertMemoryMove(Record, &DiscardReq, DefaultHostQueue);
+        MS.insertMemoryMove(Record, &DiscardReq, DefaultHostQueue, AuxCmds);
    EXPECT_TRUE(MemoryMove == nullptr);
     // The current context for the record should still be modified.
     EXPECT_EQ(Record->MCurContext, DefaultHostQueue->getContextImplPtr());
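The last hunk above checks that a discard access mode suppresses the copy command (insertMemoryMove returns nullptr) while the record's current context is still retargeted. A toy decision function capturing just that behavior; the enum, Record, and insertMemoryMove here are simplified mock-ups of the scheduler's real types:

```cpp
#include <cassert>

enum class mode { read_write, discard_read_write };
struct Context { int Id; };
struct Record { Context *MCurContext; };
struct Cmd {};

// With a discard mode the old contents may be thrown away, so no copy
// command is produced, yet the record still points at the new context.
Cmd *insertMemoryMove(Record &Rec, mode AccessMode, Context &Dst) {
  Rec.MCurContext = &Dst; // context is retargeted either way
  if (AccessMode == mode::discard_read_write)
    return nullptr;       // nothing worth copying
  static Cmd Copy;
  return &Copy;
}

int main() {
  Context Host{0}, Device{1};
  Record Rec{&Device};
  Cmd *Move = insertMemoryMove(Rec, mode::discard_read_write, Host);
  assert(Move == nullptr && Rec.MCurContext == &Host);
}
```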
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index 304f79319e0b2..ba244fa7c0bb8 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -22,6 +22,14 @@ cl::sycl::detail::Requirement getMockRequirement();
 
+namespace cl {
+namespace sycl {
+namespace detail {
+class Command;
+} // namespace detail
+} // namespace sycl
+} // namespace cl
+
 class MockCommand : public cl::sycl::detail::Command {
 public:
   MockCommand(cl::sycl::detail::QueueImplPtr Queue,
@@ -91,8 +99,9 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
 public:
   cl::sycl::detail::MemObjRecord *
   getOrInsertMemObjRecord(const cl::sycl::detail::QueueImplPtr &Queue,
-                          cl::sycl::detail::Requirement *Req) {
-    return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req);
+                          cl::sycl::detail::Requirement *Req,
+                          std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
   }
 
   void removeRecordForMemObj(cl::sycl::detail::SYCLMemObjI *MemObj) {
@@ -105,10 +114,11 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
     MGraphBuilder.cleanupCommandsForRecord(Rec, StreamsToDeallocate);
   }
 
-  void addNodeToLeaves(
-      cl::sycl::detail::MemObjRecord *Rec, cl::sycl::detail::Command *Cmd,
-      cl::sycl::access::mode Mode = cl::sycl::access::mode::read_write) {
-    return MGraphBuilder.addNodeToLeaves(Rec, Cmd, Mode);
+  void addNodeToLeaves(cl::sycl::detail::MemObjRecord *Rec,
+                       cl::sycl::detail::Command *Cmd,
+                       cl::sycl::access::mode Mode,
+                       std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.addNodeToLeaves(Rec, Cmd, Mode, ToEnqueue);
   }
 
   static bool enqueueCommand(cl::sycl::detail::Command *Cmd,
@@ -120,21 +130,26 @@ class MockScheduler : public cl::sycl::detail::Scheduler {
   cl::sycl::detail::AllocaCommandBase *
   getOrCreateAllocaForReq(cl::sycl::detail::MemObjRecord *Record,
                           const cl::sycl::detail::Requirement *Req,
-                          cl::sycl::detail::QueueImplPtr Queue) {
-    return MGraphBuilder.getOrCreateAllocaForReq(Record, Req, Queue);
+                          cl::sycl::detail::QueueImplPtr Queue,
+                          std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
   }
 
+  ReadLockT acquireGraphReadLock() { return ReadLockT{MGraphLock}; }
+
   cl::sycl::detail::Command *
   insertMemoryMove(cl::sycl::detail::MemObjRecord *Record,
                    cl::sycl::detail::Requirement *Req,
-                   const cl::sycl::detail::QueueImplPtr &Queue) {
-    return MGraphBuilder.insertMemoryMove(Record, Req, Queue);
+                   const cl::sycl::detail::QueueImplPtr &Queue,
+                   std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.insertMemoryMove(Record, Req, Queue, ToEnqueue);
   }
 
   cl::sycl::detail::Command *
   addCG(std::unique_ptr<cl::sycl::detail::CG> CommandGroup,
-        cl::sycl::detail::QueueImplPtr Queue) {
-    return MGraphBuilder.addCG(std::move(CommandGroup), Queue);
+        cl::sycl::detail::QueueImplPtr Queue,
+        std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
+    return MGraphBuilder.addCG(std::move(CommandGroup), Queue, ToEnqueue);
   }
 };
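The nested forward declaration added at the top of SchedulerTestUtils.hpp is all the new std::vector<cl::sycl::detail::Command *> & parameters need, since a pointer type in a signature never requires the complete class. A minimal standalone illustration of that design choice (only the namespace shape is taken from the patch; the function is hypothetical):

```cpp
#include <vector>

// Declaration only: commands.hpp need not be included at this point.
namespace cl {
namespace sycl {
namespace detail {
class Command;
} // namespace detail
} // namespace sycl
} // namespace cl

// std::vector<Command *> in an interface compiles with just the forward
// declaration, because no Command member is ever accessed here.
void takeToEnqueue(std::vector<cl::sycl::detail::Command *> &ToEnqueue) {
  ToEnqueue.clear();
}

int main() {
  std::vector<cl::sycl::detail::Command *> V;
  takeToEnqueue(V);
}
```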
diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
index 083fc3eb40630..720247e38c27b 100644
--- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
+++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
@@ -128,7 +128,8 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) {
   initStream(Streams[0], HQueueImpl);
 
   MockScheduler MS;
-  detail::Command *NewCmd = MS.addCG(std::move(MainCG), HQueueImpl);
+  std::vector<detail::Command *> AuxCmds;
+  detail::Command *NewCmd = MS.addCG(std::move(MainCG), HQueueImpl, AuxCmds);
   ASSERT_TRUE(!!NewCmd) << "Failed to add command group into scheduler";
   ASSERT_GT(NewCmd->MDeps.size(), 0u)
       << "No deps appeared in the new exec kernel command";
diff --git a/sycl/unittests/scheduler/utils.cpp b/sycl/unittests/scheduler/utils.cpp
index 3e80c485bc458..b6bb23b4325d8 100644
--- a/sycl/unittests/scheduler/utils.cpp
+++ b/sycl/unittests/scheduler/utils.cpp
@@ -10,7 +10,8 @@
 
 void addEdge(cl::sycl::detail::Command *User, cl::sycl::detail::Command *Dep,
              cl::sycl::detail::AllocaCommandBase *Alloca) {
-  User->addDep(cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca});
+  (void)User->addDep(
+      cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca});
   Dep->addUser(User);
 }
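The (void) casts sprinkled through the tests and utils.cpp silence the unused-result diagnostics that come with marking addDep [[nodiscard]]; at these call sites the returned connection command is known, or asserted, to be absent. A compact demonstration with a stand-in Command type (the body of addDep is illustrative only):

```cpp
// Stand-in mirroring the attribute, not the real SYCL class.
struct Command {
  [[nodiscard]] Command *addDep(Command *Dep) {
    (void)Dep;
    return nullptr; // pretend no cross-context connection was needed
  }
};

int main() {
  Command User, Dep;
  // Without the cast, this call would trigger a -Wunused-result warning
  // under the [[nodiscard]] attribute.
  (void)User.addDep(&Dep);
}
```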