diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index a70f6dde9357e..8bb5a4ffbe3eb 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -14,6 +14,7 @@ #ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H #define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H +#include "llvm/ADT/SetVector.h" #include "llvm/Frontend/Atomic/Atomic.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" @@ -2279,6 +2280,8 @@ class OpenMPIRBuilder { PostOutlineCBTy PostOutlineCB; BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB; SmallVector ExcludeArgsFromAggregate; + /// For Taskloop, certain Inputs, such as the loop bound information, need to be in a fixed position in the aggregate structure to ensure the correct values are picked up by the GEPs defined as part of the outlining callback. + SetVector Inputs; /// Collect all blocks in between EntryBB and ExitBB in both the given /// vector and set. diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h index 407eb50d2c7a3..9c509324e3316 100644 --- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h +++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h @@ -171,7 +171,7 @@ class CodeExtractorAnalysisCache { /// /// \param CEAC - Cache to speed up operations for the CodeExtractor when /// hoisting, and extracting lifetime values and assumes. - /// \param Inputs [out] - filled with values marked as inputs to the + /// \param Inputs [in/out] - filled with values marked as inputs to the /// newly outlined function. /// \param Outputs [out] - filled with values marked as outputs to the /// newly outlined function. 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 7301d7a7e4370..4ab2154d0aa53 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -395,18 +395,19 @@ Value *createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, - const Twine &Name = "", bool AsPtr = true) { + const Twine &Name = "", bool AsPtr = true, IntegerType *IntTy = nullptr) { Builder.restoreIP(OuterAllocaIP); + IntTy = IntTy ? IntTy : Builder.getInt32Ty(); Instruction *FakeVal; AllocaInst *FakeValAddr = - Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr"); + Builder.CreateAlloca(IntTy, nullptr, Name + ".addr"); ToBeDeleted.push_back(FakeValAddr); if (AsPtr) { FakeVal = FakeValAddr; } else { FakeVal = - Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val"); + Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val"); ToBeDeleted.push_back(FakeVal); } @@ -415,10 +416,10 @@ Value *createFakeIntVal(IRBuilderBase &Builder, Instruction *UseFakeVal; if (AsPtr) { UseFakeVal = - Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use"); + Builder.CreateLoad(IntTy, FakeVal, Name + ".use"); } else { UseFakeVal = - cast(Builder.CreateAdd(FakeVal, Builder.getInt32(10))); + cast(Builder.CreateAdd(FakeVal, ConstantInt::get(IntTy, 10))); } ToBeDeleted.push_back(UseFakeVal); return FakeVal; @@ -751,7 +752,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) { for (auto *V : OI.ExcludeArgsFromAggregate) Extractor.excludeArgFromAggregate(V); - Function *OutlinedFn = Extractor.extractCodeRegion(CEAC); + SetVector Outputs; + Function *OutlinedFn = Extractor.extractCodeRegion(CEAC, OI.Inputs, Outputs); // Forward target-cpu, target-features attributes to the outlined function. 
auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu"); @@ -1979,22 +1981,38 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop( OI.ExitBB = TaskloopExitBB; // Add the thread ID argument. - SmallVector ToBeDeleted; + SmallVector ToBeDeleted; // dummy instruction to be used as a fake argument OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false)); + Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, + "lb", /*AsPtr=*/false, Builder.getInt64Ty()); + Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, + "ub", /*AsPtr=*/false, Builder.getInt64Ty()); + Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, + "step", /*AsPtr=*/false, Builder.getInt64Ty()); + // For Taskloop, we want to force the bounds to be the first 3 inputs in the aggregate struct. + OI.Inputs.insert(FakeLB); + OI.Inputs.insert(FakeUB); + OI.Inputs.insert(FakeStep); OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied, TaskloopAllocaBB, CLI, Loc, - ToBeDeleted](Function &OutlinedFn) mutable { + ToBeDeleted, FakeLB, FakeUB, FakeStep](Function &OutlinedFn) mutable { // Replace the Stale CI by appropriate RTL function call. assert(OutlinedFn.hasOneUse() && "there must be a single user for the outlined function"); CallInst *StaleCI = cast(OutlinedFn.user_back()); - // HasShareds is true if any variables are captured in the outlined region, - // false otherwise. 
- bool HasShareds = StaleCI->arg_size() > 1; + // Create the casts for the bound values that can be used when outlining to replace the uses of the fakes with real values. + BasicBlock *CodeReplBB = StaleCI->getParent(); + IRBuilderBase::InsertPoint CurrentIp = Builder.saveIP(); + Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt()); + Value *CastedLBVal = Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64"); + Value *CastedUBVal = Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64"); + Value *CastedStepVal = Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64"); + Builder.restoreIP(CurrentIp); + Builder.SetInsertPoint(StaleCI); // Gather the arguments for emitting the runtime call for @@ -2015,20 +2033,18 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop( Value *TaskSize = Builder.getInt64( divideCeil(M.getDataLayout().getTypeSizeInBits(Taskloop), 8)); - Value *SharedsSize = Builder.getInt64(0); - if (HasShareds) { - AllocaInst *ArgStructAlloca = - dyn_cast(StaleCI->getArgOperand(1)); - assert(ArgStructAlloca && - "Unable to find the alloca instruction corresponding to arguments " - "for extracted function"); - StructType *ArgStructType = - dyn_cast(ArgStructAlloca->getAllocatedType()); - assert(ArgStructType && "Unable to find struct type corresponding to " - "arguments for extracted function"); - SharedsSize = - Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); - } + Value *Shareds = StaleCI->getArgOperand(1); + AllocaInst *ArgStructAlloca = + dyn_cast(Shareds); + assert(ArgStructAlloca && + "Unable to find the alloca instruction corresponding to arguments " + "for extracted function"); + StructType *ArgStructType = + dyn_cast(ArgStructAlloca->getAllocatedType()); + assert(ArgStructType && "Unable to find struct type corresponding to " + "arguments for extracted function"); + Value *SharedsSize = + Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); // Emit 
the @__kmpc_omp_task_alloc runtime call // The runtime call returns a pointer to an area where the task captured @@ -2038,31 +2054,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop( /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize, /*task_func=*/&OutlinedFn}); + Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); + Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); + Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, + SharedsSize); // Get the pointer to loop lb, ub, step from task ptr // and set up the lowerbound,upperbound and step values - llvm::Value *lb = - Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 5); - Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty()); - Builder.CreateStore(LbVal_ext, lb); - - llvm::Value *ub = - Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 6); - Value *UbVal_ext = Builder.CreateSExt(UBVal, Builder.getInt64Ty()); - Builder.CreateStore(UbVal_ext, ub); - - llvm::Value *step = - Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 7); - Value *Step_ext = Builder.CreateSExt(StepVal, Builder.getInt64Ty()); - Builder.CreateStore(Step_ext, step); - llvm::Value *loadstep = Builder.CreateLoad(Builder.getInt64Ty(), step); + llvm::Value *Lb = Builder.CreateStructGEP(ArgStructType, TaskShareds, 0); + Builder.CreateStore(CastedLBVal, Lb); - if (HasShareds) { - Value *Shareds = StaleCI->getArgOperand(1); - Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); - Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); - Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, - SharedsSize); - } + llvm::Value *Ub = Builder.CreateStructGEP(ArgStructType, TaskShareds, 1); + Builder.CreateStore(CastedUBVal, Ub); + + llvm::Value *Step = Builder.CreateStructGEP(ArgStructType, TaskShareds, 2); + Builder.CreateStore(CastedStepVal, Step); + llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), 
Step); // set up the arguments for emitting kmpc_taskloop runtime call // setting default values for ifval, nogroup, sched, grainsize, task_dup @@ -2074,8 +2080,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop( // TODO: Handle the case when TaskDup pointer isn't empty Value *TaskDup = Constant::getNullValue(Builder.getPtrTy()); - Value *Args[] = {Ident, ThreadID, TaskData, IfVal, lb, ub, - loadstep, NoGroup, Sched, GrainSize, TaskDup}; + Value *Args[] = {Ident, ThreadID, TaskData, IfVal, Lb, Ub, + Loadstep, NoGroup, Sched, GrainSize, TaskDup}; // taskloop runtime call Function *TaskloopFn = @@ -2091,32 +2097,60 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop( Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin()); - if (HasShareds) { - LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1)); - OutlinedFn.getArg(1)->replaceUsesWithIf( - Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; }); - } + LoadInst *SharedsOutlined = + Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1)); + OutlinedFn.getArg(1)->replaceUsesWithIf( + SharedsOutlined, + [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; }); Value *IV = CLI->getIndVar(); Type *IVTy = IV->getType(); - Constant *One = ConstantInt::get(IVTy, 1); - - Value *TaskLB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, - OutlinedFn.getArg(1), 5, "gep_lb"); - Value *LoadTaskLB = Builder.CreateLoad(Builder.getInt64Ty(), TaskLB); - Value *LowerBound = Builder.CreateTrunc(LoadTaskLB, IVTy, "lb"); - - Value *TaskUB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, - OutlinedFn.getArg(1), 6, "gep_ub"); - Value *LoadTaskUB = Builder.CreateLoad(Builder.getInt64Ty(), TaskUB); - Value *UpperBound = Builder.CreateTrunc(LoadTaskUB, IVTy, "ub"); + Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1); + + // When outlining, CodeExtractor will create GEP's to the LowerBound and + // UpperBound. 
These GEP's can be reused for loading the tasks respective + // bounds. + Value *TaskLB = nullptr; + Value *TaskUB = nullptr; + Value *LoadTaskLB = nullptr; + Value *LoadTaskUB = nullptr; + for (Instruction &I : *TaskloopAllocaBB) { + if (I.getOpcode() == Instruction::GetElementPtr) { + GetElementPtrInst &Gep = cast(I); + if (Gep.getOperand(0) != SharedsOutlined) + continue; + if (ConstantInt *CI = dyn_cast(Gep.getOperand(2))) { + switch (CI->getZExtValue()) { + case 0: + TaskLB = &I; + break; + case 1: + TaskUB = &I; + break; + } + } + } else if (I.getOpcode() == Instruction::Load) { + LoadInst &Load = cast(I); + if (Load.getPointerOperand() == TaskLB) { + assert(TaskLB != nullptr && "Expected value for TaskLB"); + LoadTaskLB = &I; + } else if (Load.getPointerOperand() == TaskUB) { + assert(TaskUB != nullptr && "Expected value for TaskUB"); + LoadTaskUB = &I; + } + } + } Builder.SetInsertPoint(CLI->getPreheader()->getTerminator()); - Value *TripCountMinusOne = Builder.CreateSub(UpperBound, LowerBound); + assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB"); + assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB"); + Value *TripCountMinusOne = Builder.CreateSub(LoadTaskUB, LoadTaskLB); Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt"); + Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true); + Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true); // set the trip count in the CLI - CLI->setTripCount(TripCount); + CLI->setTripCount(CastedTripCount); Builder.SetInsertPoint(CLI->getBody(), CLI->getBody()->getFirstInsertionPt()); @@ -2127,12 +2161,15 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop( if (Add->getOpcode() == llvm::Instruction::Add) { if (llvm::isa(Add->getOperand(0))) { // update the starting index of the loop - Add->setOperand(1, LowerBound); + Add->setOperand(1, CastedTaskLB); } } } } + FakeLB->replaceAllUsesWith(CastedLBVal); + 
FakeUB->replaceAllUsesWith(CastedUBVal); + FakeStep->replaceAllUsesWith(CastedStepVal); for (Instruction *I : llvm::reverse(ToBeDeleted)) { I->eraseFromParent(); }