diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index ba0507528b42e..f7e01089ab5cc 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -416,6 +416,7 @@ void initializeStructurizeCFGPass(PassRegistry&); void initializeSYCLLowerWGScopeLegacyPassPass(PassRegistry &); void initializeSYCLLowerESIMDLegacyPassPass(PassRegistry &); void initializeESIMDLowerLoadStorePass(PassRegistry &); +void initializeESIMDLowerVecArgLegacyPassPass(PassRegistry &); void initializeTailCallElimPass(PassRegistry&); void initializeTailDuplicatePass(PassRegistry&); void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 8f4bcd9e6e75f..c50459347643d 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -204,6 +204,7 @@ namespace { (void)llvm::createSYCLLowerWGScopePass(); (void)llvm::createSYCLLowerESIMDPass(); (void)llvm::createESIMDLowerLoadStorePass(); + (void)llvm::createESIMDLowerVecArgPass(); std::string buf; llvm::raw_string_ostream os(buf); (void) llvm::createPrintModulePass(os); diff --git a/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h b/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h index ac76459edafce..632459c2efd09 100644 --- a/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h +++ b/llvm/include/llvm/SYCLLowerIR/LowerESIMD.h @@ -42,6 +42,9 @@ class ESIMDLowerLoadStorePass : public PassInfoMixin { FunctionPass *createESIMDLowerLoadStorePass(); void initializeESIMDLowerLoadStorePass(PassRegistry &); +ModulePass *createESIMDLowerVecArgPass(); +void initializeESIMDLowerVecArgLegacyPassPass(PassRegistry &); + } // namespace llvm #endif // LLVM_SYCLLOWERIR_LOWERESIMD_H diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 892514384706c..5e6041c4b79c8 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -32,6 +32,7 @@ add_llvm_component_library(LLVMSYCLLowerIR LowerWGScope.cpp LowerESIMD.cpp LowerESIMDVLoadVStore.cpp + LowerESIMDVecArg.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR diff --git a/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp b/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp new file mode 100644 index 0000000000000..3488d9a9532d9 --- /dev/null +++ b/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp @@ -0,0 +1,320 @@ +//===-- ESIMDVecArgPass.cpp - lower Close To Metal (CM) constructs --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Change in function parameter type from simd* to native llvm vector type for +// cmc compiler to generate correct code for subroutine parameter passing and +// globals: +// +// Old IR: +// ====== +// +// Parameter %0 is of type simd* +// define dso_local spir_func void @_Z3fooPiN2cm3gen4simdIiLi16EEE(i32 +// addrspace(4)* %C, +// "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" * %0) +// local_unnamed_addr #2 { +// +// New IR: +// ====== +// +// Translate simd* parameter (#1) to vector <16 x 32>* type and insert bitcast. +// All users of old parameter will use result of the bitcast. +// +// define dso_local spir_func void @_Z3fooPiN2cm3gen4simdIiLi16EEE(i32 +// addrspace(4)* %C, +// <16 x i32>* %0) local_unnamed_addr #2 { +// entry: +// % 1 = bitcast<16 x i32> * % 0 to % +// "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" * +// +// +// Change in global variables: +// +// Old IR: +// ====== +// @vc = global %"class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" +// zeroinitializer, align 64 #0 +// +// % call.cm.i.i = tail call<16 x i32> @llvm.genx.vload.v16i32.p4v16i32( +// <16 x i32> addrspace(4) * getelementptr( +// % "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd", +// % "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" addrspace(4) * +// addrspacecast(% "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" * @vc to +// % "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" addrspace(4) *), i64 0, +// i32 0)) +// +// New IR: +// ====== +// +// @0 = dso_local global <16 x i32> zeroinitializer, align 64 #0 <-- New Global +// Variable +// +// % call.cm.i.i = tail call<16 x i32> @llvm.genx.vload.v16i32.p4v16i32( +// <16 x i32> addrspace(4) * getelementptr( +// % "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd", +// % "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" addrspace(4) * +// addrspacecast(% "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" * +// bitcast(<16 x i32> * @0 to +// %"class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" *) to % +// "class._ZTSN2cm3gen4simdIiLi16EEE.cm::gen::simd" addrspace(4) *), +// i64 0, i32 0)) +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +#define DEBUG_TYPE "ESIMDLowerVecArg" + +namespace llvm { + +// Forward declarations +void initializeESIMDLowerVecArgLegacyPassPass(PassRegistry &); +ModulePass *createESIMDLowerVecArgPass(); + +// Pass converts simd* function parameters and globals to +// llvm's first-class vector* type. +class ESIMDLowerVecArgPass { +public: + bool run(Module &M); + +private: + DenseMap OldNewGlobal; + + Function *rewriteFunc(Function &F); + Type *getSimdArgPtrTyOrNull(Value *arg); + void fixGlobals(Module &M); + void replaceConstExprWithGlobals(Module &M); + ConstantExpr *createNewConstantExpr(GlobalVariable *newGlobalVar, + Type *oldGlobalType, Value *old); + void removeOldGlobals(); +}; + +} // namespace llvm + +namespace { +class ESIMDLowerVecArgLegacyPass : public ModulePass { +public: + static char ID; + ESIMDLowerVecArgLegacyPass() : ModulePass(ID) { + initializeESIMDLowerVecArgLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + auto Modified = Impl.run(M); + return Modified; + } + + bool doInitialization(Module &M) override { return false; } + +private: + ESIMDLowerVecArgPass Impl; +}; +} // namespace + +char ESIMDLowerVecArgLegacyPass::ID = 0; +INITIALIZE_PASS(ESIMDLowerVecArgLegacyPass, "ESIMDLowerVecArg", + "Translate simd ptr to native vector type", false, false) + +// Public interface to VecArgPass +ModulePass *llvm::createESIMDLowerVecArgPass() { + return new ESIMDLowerVecArgLegacyPass(); +} + +// Return ptr to first-class vector type if Value is a simd*, else return +// nullptr. +Type *ESIMDLowerVecArgPass::getSimdArgPtrTyOrNull(Value *arg) { + auto ArgType = dyn_cast(arg->getType()); + if (!ArgType || !ArgType->getElementType()->isStructTy()) + return nullptr; + auto ContainedType = ArgType->getElementType(); + if ((ContainedType->getStructNumElements() != 1) || + !ContainedType->getStructElementType(0)->isVectorTy()) + return nullptr; + return PointerType::get(ContainedType->getStructElementType(0), + ArgType->getPointerAddressSpace()); +} + +// F may have multiple arguments of type simd*. This +// function updates all parameters along with call +// call sites of F. +Function *ESIMDLowerVecArgPass::rewriteFunc(Function &F) { + FunctionType *FTy = F.getFunctionType(); + Type *RetTy = FTy->getReturnType(); + SmallVector ArgTys; + + for (unsigned int i = 0; i != F.arg_size(); i++) { + auto Arg = F.getArg(i); + Type *NewTy = getSimdArgPtrTyOrNull(Arg); + if (NewTy) { + // Copy over byval type for simd* type + ArgTys.push_back(NewTy); + } else { + // Transfer all non-simd ptr arguments + ArgTys.push_back(Arg->getType()); + } + } + + FunctionType *NFTy = FunctionType::get(RetTy, ArgTys, false); + + // Create new function body and insert into the module + Function *NF = Function::Create(NFTy, F.getLinkage(), F.getName()); + F.getParent()->getFunctionList().insert(F.getIterator(), NF); + + SmallVector Returns; + SmallVector BitCasts; + ValueToValueMapTy VMap; + for (unsigned int I = 0; I != F.arg_size(); I++) { + auto Arg = F.getArg(I); + Type *newTy = getSimdArgPtrTyOrNull(Arg); + if (newTy) { + // bitcast vector* -> simd* + auto BitCast = new BitCastInst(NF->getArg(I), Arg->getType()); + BitCasts.push_back(BitCast); + VMap.insert(std::make_pair(Arg, BitCast)); + continue; + } + VMap.insert(std::make_pair(Arg, NF->getArg(I))); + } + + llvm::CloneFunctionInto(NF, &F, VMap, F.getSubprogram() != nullptr, Returns); + + for (auto &B : BitCasts) { + NF->begin()->getInstList().push_front(B); + } + + NF->takeName(&F); + + // Fix call sites + SmallVector, 10> OldNewInst; + for (auto &use : F.uses()) { + // Use must be a call site + SmallVector Params; + auto Call = cast(use.getUser()); + // Variadic functions not supported + assert(!Call->getFunction()->isVarArg() && + "Variadic functions not supported"); + for (unsigned int I = 0; I < Call->getNumArgOperands(); I++) { + auto SrcOpnd = Call->getOperand(I); + auto NewTy = getSimdArgPtrTyOrNull(SrcOpnd); + if (NewTy) { + auto BitCast = new BitCastInst(SrcOpnd, NewTy, "", Call); + Params.push_back(BitCast); + } else { + if (SrcOpnd != &F) + Params.push_back(SrcOpnd); + else + Params.push_back(NF); + } + } + // create new call instruction + auto NewCallInst = CallInst::Create(NFTy, NF, Params, ""); + NewCallInst->setCallingConv(F.getCallingConv()); + OldNewInst.push_back(std::make_pair(Call, NewCallInst)); + } + + for (auto InstPair : OldNewInst) { + auto OldInst = InstPair.first; + auto NewInst = InstPair.second; + ReplaceInstWithInst(OldInst, NewInst); + } + + F.eraseFromParent(); + + return NF; +} + +// Replace ConstantExpr if it contains old global variable. +ConstantExpr * +ESIMDLowerVecArgPass::createNewConstantExpr(GlobalVariable *NewGlobalVar, + Type *OldGlobalType, Value *Old) { + ConstantExpr *NewConstantExpr = nullptr; + + if (isa(Old)) { + NewConstantExpr = cast( + ConstantExpr::getBitCast(NewGlobalVar, OldGlobalType)); + return NewConstantExpr; + } + + auto InnerMost = createNewConstantExpr( + NewGlobalVar, OldGlobalType, cast(Old)->getOperand(0)); + + NewConstantExpr = cast( + cast(Old)->getWithOperandReplaced(0, InnerMost)); + + return NewConstantExpr; +} + +// Globals are part of ConstantExpr. This loop iterates over +// all such instances and replaces them with a new ConstantExpr +// consisting of new global vector* variable. +void ESIMDLowerVecArgPass::replaceConstExprWithGlobals(Module &M) { + for (auto &GlobalVars : OldNewGlobal) { + auto &G = *GlobalVars.first; + for (auto UseOfG : G.users()) { + auto NewGlobal = GlobalVars.second; + auto NewConstExpr = createNewConstantExpr(NewGlobal, G.getType(), UseOfG); + UseOfG->replaceAllUsesWith(NewConstExpr); + } + } +} + +// This function creates new global variables of type vector* type +// when old one is of simd* type. +void ESIMDLowerVecArgPass::fixGlobals(Module &M) { + for (auto &G : M.getGlobalList()) { + auto NewTy = getSimdArgPtrTyOrNull(&G); + if (NewTy && !G.user_empty()) { + // Peel off ptr type that getSimdArgPtrTyOrNull applies + NewTy = NewTy->getPointerElementType(); + auto ZeroInit = ConstantAggregateZero::get(NewTy); + auto NewGlobalVar = + new GlobalVariable(NewTy, G.isConstant(), G.getLinkage(), ZeroInit, + "", G.getThreadLocalMode(), G.getAddressSpace()); + NewGlobalVar->setExternallyInitialized(G.isExternallyInitialized()); + NewGlobalVar->copyAttributesFrom(&G); + NewGlobalVar->takeName(&G); + NewGlobalVar->copyMetadata(&G, 0); + M.getGlobalList().push_back(NewGlobalVar); + OldNewGlobal.insert(std::make_pair(&G, NewGlobalVar)); + } + } + + replaceConstExprWithGlobals(M); + + removeOldGlobals(); +} + +// Remove old global variables from the program. +void ESIMDLowerVecArgPass::removeOldGlobals() { + for (auto &G : OldNewGlobal) { + G.first->removeDeadConstantUsers(); + G.first->eraseFromParent(); + } +} + +bool ESIMDLowerVecArgPass::run(Module &M) { + fixGlobals(M); + + SmallVector functions; + for (auto &F : M) { + functions.push_back(&F); + } + + for (auto F : functions) { + for (unsigned int I = 0; I != F->arg_size(); I++) { + auto Arg = F->getArg(I); + if (getSimdArgPtrTyOrNull(Arg)) { + rewriteFunc(*F); + break; + } + } + } + + return true; +} diff --git a/llvm/test/SYCLLowerIR/esimd_global.ll b/llvm/test/SYCLLowerIR/esimd_global.ll new file mode 100644 index 0000000000000..ab86858fe4da7 --- /dev/null +++ b/llvm/test/SYCLLowerIR/esimd_global.ll @@ -0,0 +1,166 @@ +; This test checks whether globals are converted +; correctly to llvm's native vector type. +; +; RUN: opt < %s -ESIMDLowerVecArg -S | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown-sycldevice" + +%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" = type { <16 x i32> } + +$"_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE4Test" = comdat any + +; CHECK: [[NEWGLOBAL:[@a-zA-Z0-9_]*]] = dso_local global <16 x i32> zeroinitializer, align 64 #0 +@0 = dso_local global %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" zeroinitializer, align 64 #0 + +; Function Attrs: norecurse +define weak_odr dso_local spir_kernel void @"_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE4Test"(i32 addrspace(1)* %_arg_) local_unnamed_addr #1 comdat !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 !sycl_explicit_simd !12 !intel_reqd_sub_group_size !8 { +entry: + %vc.i = alloca %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", align 64 + %agg.tmp.i = alloca %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", align 64 + %call.esimd.i.i.i.i.i = call <3 x i32> @llvm.genx.local.id.v3i32() #5 + %local_id.y.i.i.i.i.i = extractelement <3 x i32> %call.esimd.i.i.i.i.i, i32 1 + %local_id.y.cast.ty.i.i.i.i.i = zext i32 %local_id.y.i.i.i.i.i to i64 + %call.esimd1.i.i.i.i.i = call <3 x i32> @llvm.genx.local.size.v3i32() #5 + %wgsize.y.i.i.i.i.i = extractelement <3 x i32> %call.esimd1.i.i.i.i.i, i32 1 + %wgsize.y.cast.ty.i.i.i.i.i = zext i32 %wgsize.y.i.i.i.i.i to i64 + %group.id.y.i.i.i.i.i = call i32 @llvm.genx.group.id.y() #5 + %group.id.y.cast.ty.i.i.i.i.i = zext i32 %group.id.y.i.i.i.i.i to i64 + %mul.i.i.i.i.i = mul nuw i64 %wgsize.y.cast.ty.i.i.i.i.i, %group.id.y.cast.ty.i.i.i.i.i + %add.i.i.i.i.i = add i64 %mul.i.i.i.i.i, %local_id.y.cast.ty.i.i.i.i.i + %local_id.x.i.i.i.i.i = extractelement <3 x i32> %call.esimd.i.i.i.i.i, i32 0 + %local_id.x.cast.ty.i.i.i.i.i = zext i32 %local_id.x.i.i.i.i.i to i64 + %wgsize.x.i.i.i.i.i = extractelement <3 x i32> %call.esimd1.i.i.i.i.i, i32 0 + %wgsize.x.cast.ty.i.i.i.i.i = zext i32 %wgsize.x.i.i.i.i.i to i64 + %group.id.x.i.i.i.i.i = call i32 @llvm.genx.group.id.x() #5 + %group.id.x.cast.ty.i.i.i.i.i = zext i32 %group.id.x.i.i.i.i.i to i64 + %mul.i4.i.i.i.i = mul nuw i64 %group.id.x.cast.ty.i.i.i.i.i, %wgsize.x.cast.ty.i.i.i.i.i + %add.i5.i.i.i.i = add i64 %mul.i4.i.i.i.i, %local_id.x.cast.ty.i.i.i.i.i + %0 = bitcast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %agg.tmp.i to i8* + call void @llvm.lifetime.start.p0i8(i64 64, i8* nonnull %0) + %1 = bitcast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %vc.i to i8* + call void @llvm.lifetime.start.p0i8(i64 64, i8* nonnull %1) #5 + %conv.i = trunc i64 %add.i5.i.i.i.i to i32 + %2 = addrspacecast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %vc.i to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* + %splat.splatinsert.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0 + %splat.splat.i.i = shufflevector <16 x i32> %splat.splatinsert.i.i, <16 x i32> undef, <16 x i32> zeroinitializer + %M_data.i13.i = getelementptr inbounds %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* %2, i64 0, i32 0 + store <16 x i32> %splat.splat.i.i, <16 x i32> addrspace(4)* %M_data.i13.i, align 64, !tbaa !13 + %conv3.i = trunc i64 %add.i.i.i.i.i to i32 + %splat.splatinsert.i20.i = insertelement <8 x i32> undef, i32 %conv3.i, i32 0 + %splat.splat.i21.i = shufflevector <8 x i32> %splat.splatinsert.i20.i, <8 x i32> undef, <8 x i32> zeroinitializer + %call.esimd.i.i.i.i.i2 = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* %M_data.i13.i) #5 + %call4.esimd.i.i.i.i = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.v8i1(<16 x i32> %call.esimd.i.i.i.i.i2, <8 x i32> %splat.splat.i21.i, i32 0, i32 8, i32 1, i16 0, i32 0, <8 x i1> ) #5 + call void @llvm.genx.vstore.v16i32.p4v16i32(<16 x i32> %call4.esimd.i.i.i.i, <16 x i32> addrspace(4)* %M_data.i13.i) #5 + %cmp.i = icmp eq i64 %add.i.i.i.i.i, 0 + %..i = select i1 %cmp.i, i64 %add.i5.i.i.i.i, i64 %add.i.i.i.i.i + %conv9.i = trunc i64 %..i to i32 +; CHECK: store <16 x i32> , <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds ({{.+}}, {{.+}}* bitcast (<16 x i32>* [[NEWGLOBAL]] to {{.+}}*), i64 0, i32 0) to <16 x i32> addrspace(4)*), align 64, !tbaa.struct !16 + store <16 x i32> , <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* @0, i64 0, i32 0) to <16 x i32> addrspace(4)*), align 64, !tbaa.struct !16 + %mul.i = shl nsw i32 %conv9.i, 4 + %idx.ext.i = sext i32 %mul.i to i64 + %add.ptr.i16 = getelementptr inbounds i32, i32 addrspace(1)* %_arg_, i64 %idx.ext.i + %add.ptr.i = addrspacecast i32 addrspace(1)* %add.ptr.i16 to i32 addrspace(4)* + %3 = addrspacecast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %agg.tmp.i to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* + %call.esimd.i.i.i = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* %M_data.i13.i) #5 + %M_data.i2.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* %3, i64 0, i32 0 + call void @llvm.genx.vstore.v16i32.p4v16i32(<16 x i32> %call.esimd.i.i.i, <16 x i32> addrspace(4)* %M_data.i2.i.i) #5 + call spir_func void @_Z3fooPiN2cl4sycl5intel3gpu4simdIiLi16EEE(i32 addrspace(4)* %add.ptr.i, %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* nonnull %agg.tmp.i) #5 + store <16 x i32> , <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* @0, i64 0, i32 0) to <16 x i32> addrspace(4)*), align 64, !tbaa.struct !16 + call void @llvm.lifetime.end.p0i8(i64 64, i8* nonnull %1) #5 + call void @llvm.lifetime.end.p0i8(i64 64, i8* nonnull %0) + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg %0, i8* nocapture %1) #2 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg %0, i8* nocapture %1) #2 + +; Function Attrs: noinline norecurse nounwind +define dso_local spir_func void @_Z3fooPiN2cl4sycl5intel3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %v) local_unnamed_addr #3 !sycl_explicit_simd !12 { +entry: + %agg.tmp = alloca %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", align 64 + %0 = addrspacecast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %v to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* + %1 = addrspacecast %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* %agg.tmp to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* + %M_data.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* %0, i64 0, i32 0 + %call.esimd.i.i = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* %M_data.i.i), !noalias !17 +; CHECK: {{.+}} = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* getelementptr ({{.+}}, {{.+}} addrspace(4)* addrspacecast ({{.+}}* bitcast (<16 x i32>* [[NEWGLOBAL]] to {{.+}}*) to {{.+}} addrspace(4)*), i64 0, i32 0)), !noalias !17 + %call.esimd.i8.i = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* getelementptr (%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* addrspacecast (%"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd"* @0 to %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)*), i64 0, i32 0)), !noalias !17 + %add.i = add <16 x i32> %call.esimd.i8.i, %call.esimd.i.i + %M_data.i.i.i = getelementptr inbounds %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd", %"class._ZTSN2cl4sycl5intel3gpu4simdIiLi16EEE.cl::sycl::intel::gpu::simd" addrspace(4)* %1, i64 0, i32 0 + call void @llvm.genx.vstore.v16i32.p4v16i32(<16 x i32> %add.i, <16 x i32> addrspace(4)* %M_data.i.i.i) + %2 = ptrtoint i32 addrspace(4)* %C to i64 + %call.esimd.i.i2 = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* %M_data.i.i.i) + call void @llvm.genx.svm.block.st.v16i32(i64 %2, <16 x i32> %call.esimd.i.i2) + ret void +} + +; Function Attrs: nounwind readnone +declare !genx_intrinsic_id !20 <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.i16.v8i1(<16 x i32> %0, <8 x i32> %1, i32 %2, i32 %3, i32 %4, i16 %5, i32 %6, <8 x i1> %7) #4 + +; Function Attrs: nounwind +declare !genx_intrinsic_id !21 <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* %0) #5 + +; Function Attrs: nounwind +declare !genx_intrinsic_id !22 void @llvm.genx.vstore.v16i32.p4v16i32(<16 x i32> %0, <16 x i32> addrspace(4)* %1) #5 + +; Function Attrs: nounwind +declare !genx_intrinsic_id !23 void @llvm.genx.svm.block.st.v16i32(i64 %0, <16 x i32> %1) #5 + +; Function Attrs: nounwind readnone +declare !genx_intrinsic_id !24 <3 x i32> @llvm.genx.local.id.v3i32() #4 + +; Function Attrs: nounwind readnone +declare !genx_intrinsic_id !25 <3 x i32> @llvm.genx.local.size.v3i32() #4 + +; Function Attrs: nounwind readnone +declare !genx_intrinsic_id !26 i32 @llvm.genx.group.id.y() #4 + +; Function Attrs: nounwind readnone +declare !genx_intrinsic_id !27 i32 @llvm.genx.group.id.x() #4 + +attributes #0 = { "genx_byte_offset"="192" "genx_volatile" } +attributes #1 = { norecurse "CMGenxMain" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "oclrt"="1" "stack-protector-buffer-size"="8" "sycl-module-id"="subroutine.cpp" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind willreturn } +attributes #3 = { noinline norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="512" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind readnone } +attributes #5 = { nounwind } + +!llvm.dependent-libraries = !{!0} +!llvm.module.flags = !{!1} +!opencl.spir.version = !{!2} +!spirv.Source = !{!3} +!llvm.ident = !{!4} +!genx.kernels = !{!5} + +!0 = !{!"libcpmt"} +!1 = !{i32 1, !"wchar_size", i32 2} +!2 = !{i32 1, i32 2} +!3 = !{i32 6, i32 100000} +!4 = !{!"clang version 11.0.0"} +!5 = !{void (i32 addrspace(1)*)* @"_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE4Test", !"_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE4Test", !6, i32 0, i32 0, !6, !7, i32 0, i32 0} +!6 = !{i32 0} +!7 = !{!"svmptr_t"} +!8 = !{i32 1} +!9 = !{!"none"} +!10 = !{!"int*"} +!11 = !{!""} +!12 = !{} +!13 = !{!14, !14, i64 0} +!14 = !{!"omnipotent char", !15, i64 0} +!15 = !{!"Simple C++ TBAA"} +!16 = !{i64 0, i64 64, !13} +!17 = !{!18} +!18 = distinct !{!18, !19, !"_ZNK2cl4sycl5intel3gpu4simdIiLi16EEplERKS4_: %agg.result"} +!19 = distinct !{!19, !"_ZNK2cl4sycl5intel3gpu4simdIiLi16EEplERKS4_"} +!20 = !{i32 8275} +!21 = !{i32 8268} +!22 = !{i32 8269} +!23 = !{i32 8166} +!24 = !{i32 8029} +!25 = !{i32 8034} +!26 = !{i32 8020} +!27 = !{i32 8019} + diff --git a/llvm/test/SYCLLowerIR/esimd_subroutine.ll b/llvm/test/SYCLLowerIR/esimd_subroutine.ll new file mode 100644 index 0000000000000..81bbcb9e0016b --- /dev/null +++ b/llvm/test/SYCLLowerIR/esimd_subroutine.ll @@ -0,0 +1,107 @@ +; This test checks whether subroutine arguments are converted +; correctly to llvm's native vector type. +; +; RUN: opt < %s -ESIMDLowerVecArg -S | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown-sycldevice" + +%class._ZTS4simdIiLi16EE.simd = type { <16 x i32> } + +$_ZN4simdIiLi16EEC1ERS0_ = comdat any + +$_ZN4simdIiLi16EEC2ERS0_ = comdat any + +; Function Attrs: norecurse nounwind +define spir_func void @_Z3fooi(i32 %x) #0 { +entry: + %x.addr = alloca i32, align 4 +; CHECK: {{.+}} = alloca {{.+}} +; CHECK-NEXT: [[A:%[a-zA-Z0-9_]*]] = alloca {{.+}} + %a = alloca %class._ZTS4simdIiLi16EE.simd, align 64 + %agg.tmp = alloca %class._ZTS4simdIiLi16EE.simd, align 64 + store i32 %x, i32* %x.addr, align 4, !tbaa !4 + %0 = bitcast %class._ZTS4simdIiLi16EE.simd* %a to i8* + call void @llvm.lifetime.start.p0i8(i64 64, i8* %0) #2 +; CHECK: [[ADDRSPCAST1:%[a-zA-Z0-9_]*]] = addrspacecast {{.+}} [[A]] to {{.+}} + %1 = addrspacecast %class._ZTS4simdIiLi16EE.simd* %agg.tmp to %class._ZTS4simdIiLi16EE.simd addrspace(4)* + %2 = addrspacecast %class._ZTS4simdIiLi16EE.simd* %a to %class._ZTS4simdIiLi16EE.simd addrspace(4)* +; CHECK: [[BITCASTRESULT1:%[a-zA-Z0-9_]*]] = bitcast {{.+}} addrspace(4)* [[ADDRSPCAST1]] to <16 x i32> addrspace(4)* +; CHECK-NEXT: call spir_func void @_ZN4simdIiLi16EEC1ERS0_(<16 x i32> addrspace(4)* {{.+}}, <16 x i32> addrspace(4)* [[BITCASTRESULT1]]) + call spir_func void @_ZN4simdIiLi16EEC1ERS0_(%class._ZTS4simdIiLi16EE.simd addrspace(4)* %1, %class._ZTS4simdIiLi16EE.simd addrspace(4)* align 64 dereferenceable(64) %2) +; CHECK: [[BITCASTRESULT2:%[a-zA-Z0-9_]*]] = bitcast {{.+}} to <16 x i32>* +; CHECK-NEXT: {{.+}} = call spir_func i32 {{.+}}bar{{.+}}(<16 x i32>* [[BITCASTRESULT2]]) + %call = call spir_func i32 @_Z3bar4simdIiLi16EE(%class._ZTS4simdIiLi16EE.simd* %agg.tmp) + %3 = bitcast %class._ZTS4simdIiLi16EE.simd* %a to i8* + call void @llvm.lifetime.end.p0i8(i64 64, i8* %3) #2 + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +; CHECK: define spir_func i32 @_Z3bar4simdIiLi16EE(<16 x i32>* {{.+}} +define spir_func i32 @_Z3bar4simdIiLi16EE(%class._ZTS4simdIiLi16EE.simd* %v) #0 { +entry: +; CHECK: {{.+}} = bitcast <16 x i32>* {{.+}} + ret i32 1 +} + +; Function Attrs: norecurse nounwind +; CHECK: define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(<16 x i32> addrspace(4)* [[OLDARG0:%[a-zA-Z0-9_]*]], <16 x i32> addrspace(4)* [[OLDARG1:%[a-zA-Z0-9_]*]]) unnamed_addr {{.+}} +define linkonce_odr spir_func void @_ZN4simdIiLi16EEC1ERS0_(%class._ZTS4simdIiLi16EE.simd addrspace(4)* %this, %class._ZTS4simdIiLi16EE.simd addrspace(4)* align 64 dereferenceable(64) %other) unnamed_addr #0 comdat align 2 { +entry: +; CHECK: [[NEWARG1:%[a-zA-Z0-9_]*]] = bitcast <16 x i32> addrspace(4)* [[OLDARG1]] to {{.+}} +; CHECK-NEXT: [[NEWARG0:%[a-zA-Z0-9_]*]] = bitcast <16 x i32> addrspace(4)* [[OLDARG0]] to {{.+}} + %this.addr = alloca %class._ZTS4simdIiLi16EE.simd addrspace(4)*, align 8 + %other.addr = alloca %class._ZTS4simdIiLi16EE.simd addrspace(4)*, align 8 +; CHECK: store {{.+}} addrspace(4)* [[NEWARG0]], {{.+}} + store %class._ZTS4simdIiLi16EE.simd addrspace(4)* %this, %class._ZTS4simdIiLi16EE.simd addrspace(4)** %this.addr, align 8, !tbaa !8 +; CHECK-NEXT: store {{.+}} addrspace(4)* [[NEWARG1]], {{.+}} + store %class._ZTS4simdIiLi16EE.simd addrspace(4)* %other, %class._ZTS4simdIiLi16EE.simd addrspace(4)** %other.addr, align 8, !tbaa !8 + %this1 = load %class._ZTS4simdIiLi16EE.simd addrspace(4)*, %class._ZTS4simdIiLi16EE.simd addrspace(4)** %this.addr, align 8 + %0 = load %class._ZTS4simdIiLi16EE.simd addrspace(4)*, %class._ZTS4simdIiLi16EE.simd addrspace(4)** %other.addr, align 8 + call spir_func void @_ZN4simdIiLi16EEC2ERS0_(%class._ZTS4simdIiLi16EE.simd addrspace(4)* %this1, %class._ZTS4simdIiLi16EE.simd addrspace(4)* align 64 dereferenceable(64) %0) + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_ZN4simdIiLi16EEC2ERS0_(%class._ZTS4simdIiLi16EE.simd addrspace(4)* %this, %class._ZTS4simdIiLi16EE.simd addrspace(4)* align 64 dereferenceable(64) %other) unnamed_addr #0 comdat align 2 { +entry: + %this.addr = alloca %class._ZTS4simdIiLi16EE.simd addrspace(4)*, align 8 + %other.addr = alloca %class._ZTS4simdIiLi16EE.simd addrspace(4)*, align 8 + store %class._ZTS4simdIiLi16EE.simd addrspace(4)* %this, %class._ZTS4simdIiLi16EE.simd addrspace(4)** %this.addr, align 8, !tbaa !8 + store %class._ZTS4simdIiLi16EE.simd addrspace(4)* %other, %class._ZTS4simdIiLi16EE.simd addrspace(4)** %other.addr, align 8, !tbaa !8 + %this1 = load %class._ZTS4simdIiLi16EE.simd addrspace(4)*, %class._ZTS4simdIiLi16EE.simd addrspace(4)** %this.addr, align 8 + %0 = load %class._ZTS4simdIiLi16EE.simd addrspace(4)*, %class._ZTS4simdIiLi16EE.simd addrspace(4)** %other.addr, align 8, !tbaa !8 + %__M_data = getelementptr inbounds %class._ZTS4simdIiLi16EE.simd, %class._ZTS4simdIiLi16EE.simd addrspace(4)* %0, i32 0, i32 0 + %1 = load <16 x i32>, <16 x i32> addrspace(4)* %__M_data, align 64, !tbaa !10 + %__M_data2 = getelementptr inbounds %class._ZTS4simdIiLi16EE.simd, %class._ZTS4simdIiLi16EE.simd addrspace(4)* %this1, i32 0, i32 0 + store <16 x i32> %1, <16 x i32> addrspace(4)* %__M_data2, align 64, !tbaa !10 + ret void +} + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!opencl.spir.version = !{!1} +!spirv.Source = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{i32 4, i32 100000} +!3 = !{!"clang version 11.0.0 (https://github.com/kbobrovs/llvm.git fb752d6351dc6785f5438b137a86fa39a3493225)"} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C++ TBAA"} +!8 = !{!9, !9, i64 0} +!9 = !{!"any pointer", !6, i64 0} +!10 = !{!6, !6, i64 0} diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index 8c0a0430b1e8e..fa32a1916d65c 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -596,6 +596,7 @@ int main(int argc, char **argv) { initializeSYCLLowerWGScopeLegacyPassPass(Registry); initializeSYCLLowerESIMDLegacyPassPass(Registry); initializeESIMDLowerLoadStorePass(Registry); + initializeESIMDLowerVecArgLegacyPassPass(Registry); #ifdef BUILD_EXAMPLES initializeExampleIRTransforms(Registry);