Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5827,8 +5827,9 @@ class OffloadingActionBuilder final {
DA.add(*DeviceWrappingAction, *TC, BoundArch, Action::OFK_SYCL);
continue;
}
if (isNVPTX && Args.hasArg(options::OPT_fsycl_embed_ir)) {
// When compiling for Nvidia/CUDA devices and the user requested the
if ((isNVPTX || isAMDGCN) &&
Args.hasArg(options::OPT_fsycl_embed_ir)) {
// When compiling for Nvidia/AMD devices and the user requested the
// IR to be embedded in the application (via option), run the output
// of sycl-post-link (filetable referencing LLVM Bitcode + symbols)
// through the offload wrapper and link the resulting object to the
Expand Down
26 changes: 25 additions & 1 deletion sycl-fusion/common/include/Kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,30 @@ namespace jit_compiler {

using BinaryAddress = const uint8_t *;

/// Flags describing the barrier to be placed between fused kernels.
///
/// `Local` and `Global` occupy distinct bits, so `LocalAndGlobal` is simply
/// their bitwise union and the `has*BarrierFlag` queries below test one bit.
enum class BarrierFlags : uint32_t {
  /// Do not insert barrier.
  None = 0,
  /// Ensure correct ordering of memory operations to local memory.
  Local = 1,
  /// Ensure correct ordering of memory operations to global memory.
  Global = 2,
  /// Ensure correct ordering of memory operations to both memories.
  LocalAndGlobal = Local | Global
};

/// \returns the flag value meaning "insert no barrier".
constexpr BarrierFlags getNoBarrierFlag() {
  return BarrierFlags::None;
}

/// \returns the flag value with both the local and the global bit set.
constexpr BarrierFlags getLocalAndGlobalBarrierFlag() {
  return BarrierFlags::LocalAndGlobal;
}

/// \returns true iff \p Flag is exactly the "no barrier" value.
constexpr bool isNoBarrierFlag(BarrierFlags Flag) {
  return getNoBarrierFlag() == Flag;
}

/// \returns true iff the local-memory bit is set in \p Flag.
constexpr bool hasLocalBarrierFlag(BarrierFlags Flag) {
  const auto Bits = static_cast<uint32_t>(Flag);
  const auto LocalBit = static_cast<uint32_t>(BarrierFlags::Local);
  return (Bits & LocalBit) != 0;
}

/// \returns true iff the global-memory bit is set in \p Flag.
constexpr bool hasGlobalBarrierFlag(BarrierFlags Flag) {
  const auto Bits = static_cast<uint32_t>(Flag);
  const auto GlobalBit = static_cast<uint32_t>(BarrierFlags::Global);
  return (Bits & GlobalBit) != 0;
}

///
/// Enumerate possible kinds of parameters.
/// 1:1 correspondence with the definition in kernel_desc.hpp in the DPC++ SYCL
Expand All @@ -35,7 +59,7 @@ enum class ParameterKind : uint32_t {
};

/// Different binary formats supported as input to the JIT compiler.
enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX };
enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX, AMDGCN };

/// Information about a device intermediate representation module (e.g., SPIR-V,
/// LLVM IR) from DPC++.
Expand Down
1 change: 1 addition & 0 deletions sycl-fusion/common/lib/KernelIO.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ template <> struct ScalarEnumerationTraits<jit_compiler::BinaryFormat> {
IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM);
IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV);
IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX);
IO.enumCase(BF, "AMDGCN", jit_compiler::BinaryFormat::AMDGCN);
IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID);
}
};
Expand Down
7 changes: 7 additions & 0 deletions sycl-fusion/jit-compiler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ add_llvm_library(sycl-fusion
lib/fusion/ModuleHelper.cpp
lib/helper/ConfigHelper.cpp

DEPENDS
intrinsics_gen

LINK_COMPONENTS
BitReader
Core
Expand Down Expand Up @@ -50,6 +53,10 @@ if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX)
endif()

# Define FUSION_JIT_SUPPORT_AMDGCN for the sycl-fusion library only when the
# AMDGPU backend is among the LLVM targets being built.
if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_AMDGCN)
endif()

if (BUILD_SHARED_LIBS)
if(NOT MSVC AND NOT APPLE)
# Manage symbol visibility through the linker to make sure no LLVM symbols
Expand Down
3 changes: 2 additions & 1 deletion sycl-fusion/jit-compiler/include/JITContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "Hashing.h"
#include "Kernel.h"
#include "Options.h"
#include "Parameter.h"

namespace llvm {
Expand All @@ -28,7 +29,7 @@ class LLVMContext;
namespace jit_compiler {

using CacheKeyT =
std::tuple<std::vector<std::string>, ParamIdentList, int,
std::tuple<std::vector<std::string>, ParamIdentList, BarrierFlags,
std::vector<ParameterInternalization>, std::vector<JITConstant>,
// This field of the cache is optional because, if all of the
// ranges are equal, we will perform no remapping, so that fused
Expand Down
3 changes: 2 additions & 1 deletion sycl-fusion/jit-compiler/include/KernelFusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ class KernelFusion {
const std::vector<SYCLKernelInfo> &KernelInformation,
const std::vector<std::string> &KernelsToFuse,
const std::string &FusedKernelName,
jit_compiler::ParamIdentList &Identities, int BarriersFlags,
jit_compiler::ParamIdentList &Identities,
BarrierFlags BarriersFlags,
const std::vector<jit_compiler::ParameterInternalization>
&Internalization,
const std::vector<jit_compiler::JITConstant> &JITConstants);
Expand Down
16 changes: 13 additions & 3 deletions sycl-fusion/jit-compiler/lib/KernelFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
#else // FUSION_JIT_SUPPORT_PTX
return false;
#endif // FUSION_JIT_SUPPORT_PTX
}
case BinaryFormat::AMDGCN: {
#ifdef FUSION_JIT_SUPPORT_AMDGCN
return true;
#else // FUSION_JIT_SUPPORT_AMDGCN
return false;
#endif // FUSION_JIT_SUPPORT_AMDGCN
}
default:
return false;
Expand All @@ -69,7 +76,7 @@ FusionResult KernelFusion::fuseKernels(
const std::vector<SYCLKernelInfo> &KernelInformation,
const std::vector<std::string> &KernelsToFuse,
const std::string &FusedKernelName, ParamIdentList &Identities,
int BarriersFlags,
BarrierFlags BarriersFlags,
const std::vector<jit_compiler::ParameterInternalization> &Internalization,
const std::vector<jit_compiler::JITConstant> &Constants) {
// Initialize the configuration helper to make the options for this invocation
Expand All @@ -93,8 +100,11 @@ FusionResult KernelFusion::fuseKernels(
"Fusion output target format not supported by this build");
}

if (TargetFormat == BinaryFormat::PTX && IsHeterogeneousList) {
return FusionResult{"Heterogeneous ND ranges not supported for CUDA"};
if ((TargetFormat == BinaryFormat::PTX ||
TargetFormat == BinaryFormat::AMDGCN) &&
IsHeterogeneousList) {
return FusionResult{
"Heterogeneous ND ranges not supported for CUDA and HIP"};
}

bool CachingEnabled = ConfigHelper::get<option::JITEnableCaching>();
Expand Down
4 changes: 2 additions & 2 deletions sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ static unsigned getFlatAddressSpace(Module &Mod) {
// Ideally, we could get this information from the TargetTransformInfo, but
// the SPIR-V backend does not yet seem to have an implementation for that.
llvm::Triple Tri(Mod.getTargetTriple());
if (Tri.isNVPTX()) {
if (Tri.isNVPTX() || Tri.isAMDGCN()) {
return 0;
}
if (Tri.isSPIRV() || Tri.isSPIR()) {
Expand All @@ -53,7 +53,7 @@ static unsigned getFlatAddressSpace(Module &Mod) {

std::unique_ptr<SYCLModuleInfo>
FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
int BarriersFlags) {
BarrierFlags BarriersFlags) {
// Perform the actual kernel fusion, i.e., generate a kernel function for the
// fused kernel from the kernel functions of the input kernels. This is done
// by the SYCLKernelFusion LLVM pass, which is run here through a custom LLVM
Expand Down
2 changes: 1 addition & 1 deletion sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class FusionPipeline {
/// contain an entry for the fused kernel.
static std::unique_ptr<SYCLModuleInfo>
runFusionPasses(llvm::Module &Mod, SYCLModuleInfo &InputInfo,
int BarriersFlags);
BarrierFlags BarriersFlags);
};
} // namespace fusion
} // namespace jit_compiler
Expand Down
82 changes: 82 additions & 0 deletions sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,15 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
KernelBin = *BinaryOrError;
break;
}
case BinaryFormat::AMDGCN: {
llvm::Expected<KernelBinary *> BinaryOrError =
translateToAMDGCN(Kernel, Mod, JITCtx);
if (auto Error = BinaryOrError.takeError()) {
return Error;
}
KernelBin = *BinaryOrError;
break;
}
default: {
return createStringError(
inconvertibleErrorCode(),
Expand Down Expand Up @@ -287,3 +296,76 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX);
#endif // FUSION_JIT_SUPPORT_PTX
}

/// Compile \p Mod into an AMDGCN object file and hand the result to \p JITCtx.
///
/// The target CPU and feature string are read from the "target-cpu" and
/// "target-features" attributes of the function named \p KernelInfo.Name, when
/// that function exists in \p Mod and carries them. Otherwise the CPU defaults
/// to gfx906 and the feature string is empty.
///
/// \param KernelInfo Kernel whose name selects the function to inspect.
/// \param Mod        Module to run the code generation pipeline on.
/// \param JITCtx     Context that receives the emitted object.
/// \returns the address of the object returned by
/// JITContext::emplaceKernelBinary, or an error if this build has no AMDGCN
/// support, the target cannot be looked up, the target machine cannot be
/// created, or the emission pipeline cannot be constructed.
llvm::Expected<KernelBinary *>
KernelTranslator::translateToAMDGCN(SYCLKernelInfo &KernelInfo,
                                    llvm::Module &Mod, JITContext &JITCtx) {
#ifndef FUSION_JIT_SUPPORT_AMDGCN
  (void)KernelInfo;
  (void)Mod;
  (void)JITCtx;
  return createStringError(inconvertibleErrorCode(),
                           "AMDGPU translation not supported in this build");
#else // FUSION_JIT_SUPPORT_AMDGCN

  // Register the AMDGPU backend so that the target lookup below can find it.
  LLVMInitializeAMDGPUTargetInfo();
  LLVMInitializeAMDGPUTarget();
  LLVMInitializeAMDGPUAsmPrinter();
  LLVMInitializeAMDGPUTargetMC();

  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";

  std::string TargetTriple{"amdgcn-amd-amdhsa"};

  std::string ErrorMessage;
  const auto *Target =
      llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage);

  if (!Target) {
    return createStringError(
        inconvertibleErrorCode(),
        "Failed to load and translate AMDGCN LLVM IR module with error %s",
        ErrorMessage.c_str());
  }

  // Set to the lowest tested target according to the GetStartedGuide, section
  // "Build DPC++ toolchain with support for HIP AMD"
  llvm::StringRef TargetCPU{"gfx906"};
  llvm::StringRef TargetFeatures{""};
  // Prefer the CPU/features recorded on the kernel function, if present.
  if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) {
    if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
      TargetCPU =
          KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
    }
    if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
      TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
                           .getValueAsString();
    }
  }

  // FIXME: Check whether we can provide more accurate target information here
  // createTargetMachine hands back a raw pointer that the caller must free;
  // own it here so it is released on every exit path instead of leaking.
  std::unique_ptr<llvm::TargetMachine> TM(Target->createTargetMachine(
      TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_,
      std::nullopt, llvm::CodeGenOpt::Default));

  // Guard against a failed creation rather than dereferencing a null pointer.
  if (!TM) {
    return createStringError(
        inconvertibleErrorCode(),
        "Failed to create target machine for AMDGCN translation");
  }

  std::string AMDObj;
  {
    // The pass manager and both streams live in this scope only, so they are
    // destroyed before AMDObj is moved out below and before TM is released.
    llvm::legacy::PassManager PM;
    llvm::raw_string_ostream OBJStream{AMDObj};
    llvm::buffer_ostream BufferedOBJ{OBJStream};

    // addPassesToEmitFile reports failure by returning true.
    if (TM->addPassesToEmitFile(PM, BufferedOBJ, nullptr,
                                llvm::CGFT_ObjectFile)) {
      return createStringError(
          inconvertibleErrorCode(),
          "Failed to construct pass pipeline to emit output");
    }

    PM.run(Mod);
    OBJStream.flush();
  }

  return &JITCtx.emplaceKernelBinary(std::move(AMDObj), BinaryFormat::AMDGCN);
#endif // FUSION_JIT_SUPPORT_AMDGCN
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ class KernelTranslator {

static llvm::Expected<KernelBinary *>
translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx);

/// Translate \p Mod to an AMDGCN object file and store it in \p JITCtx.
///
/// Returns an error when the library was built without
/// FUSION_JIT_SUPPORT_AMDGCN, when the AMDGPU target cannot be looked up, or
/// when the pass pipeline for emitting the object cannot be constructed.
static llvm::Expected<KernelBinary *>
translateToAMDGCN(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
JITContext &JITCtx);
};
} // namespace translation
} // namespace jit_compiler
Expand Down
8 changes: 8 additions & 0 deletions sycl-fusion/passes/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_PTX)
endif()

# Define FUSION_JIT_SUPPORT_AMDGCN for the SYCLKernelFusion target only when
# the AMDGPU backend is among the LLVM targets being built.
if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_AMDGCN)
endif()

# Static library for linking with the jit_compiler
add_llvm_library(SYCLKernelFusionPasses
SYCLFusionPasses.cpp
Expand Down Expand Up @@ -68,3 +72,7 @@ target_link_libraries(SYCLKernelFusionPasses
if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_PTX)
endif()

# Define FUSION_JIT_SUPPORT_AMDGCN for the SYCLKernelFusionPasses static
# library only when the AMDGPU backend is among the LLVM targets being built.
if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_AMDGCN)
endif()
8 changes: 6 additions & 2 deletions sycl-fusion/passes/SYCLFusionPasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/PassPlugin.h"

#include "Kernel.h"

#include "internalization/Internalization.h"
#include "kernel-fusion/SYCLKernelFusion.h"
#include "kernel-info/SYCLKernelInfo.h"
#include "syclcp/SYCLCP.h"

using namespace llvm;
using namespace jit_compiler;

cl::opt<bool>
NoBarriers("sycl-kernel-fusion-no-barriers",
Expand All @@ -28,8 +31,9 @@ llvm::PassPluginLibraryInfo getSYCLKernelFusionPluginInfo() {
[](StringRef Name, ModulePassManager &MPM,
ArrayRef<PassBuilder::PipelineElement>) {
if (Name == "sycl-kernel-fusion") {
int BarrierFlag =
(NoBarriers) ? -1 : SYCLKernelFusion::DefaultBarriersFlags;
BarrierFlags BarrierFlag =
(NoBarriers) ? getNoBarrierFlag()
: SYCLKernelFusion::DefaultBarriersFlags;
MPM.addPass(SYCLKernelFusion(BarrierFlag));
return true;
}
Expand Down
15 changes: 7 additions & 8 deletions sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,13 +230,12 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder,
return {Entry, CallInsertion, Exit};
}

static Expected<CallInst *>
createFusionCall(IRBuilderBase &Builder, Function *F,
ArrayRef<Value *> CallArgs,
const jit_compiler::NDRange &SrcNDRange,
const jit_compiler::NDRange &FusedNDRange, bool IsLast,
int BarriersFlags, jit_compiler::Remapper &Remapper,
bool ShouldRemap, TargetFusionInfo &TargetInfo) {
static Expected<CallInst *> createFusionCall(
IRBuilderBase &Builder, Function *F, ArrayRef<Value *> CallArgs,
const jit_compiler::NDRange &SrcNDRange,
const jit_compiler::NDRange &FusedNDRange, bool IsLast,
jit_compiler::BarrierFlags BarriersFlags, jit_compiler::Remapper &Remapper,
bool ShouldRemap, TargetFusionInfo &TargetInfo) {
const auto IPs = addGuard(Builder, SrcNDRange, FusedNDRange, IsLast);

if (ShouldRemap) {
Expand All @@ -261,7 +260,7 @@ createFusionCall(IRBuilderBase &Builder, Function *F,
Builder.SetInsertPoint(IPs.Exit);

// Insert barrier if needed
if (!IsLast && BarriersFlags > 0) {
if (!IsLast && !jit_compiler::isNoBarrierFlag(BarriersFlags)) {
TargetInfo.createBarrierCall(Builder, BarriersFlags);
}

Expand Down
11 changes: 4 additions & 7 deletions sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
constexpr static llvm::StringLiteral NDRangesMDKey{"sycl.kernel.nd-ranges"};

constexpr SYCLKernelFusion() = default;
constexpr explicit SYCLKernelFusion(int BarriersFlags)
constexpr explicit SYCLKernelFusion(jit_compiler::BarrierFlags BarriersFlags)
: BarriersFlags{BarriersFlags} {}

llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
Expand All @@ -45,7 +45,8 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
///
/// By default, correct ordering of memory operations to global memory is
/// ensured.
constexpr static int DefaultBarriersFlags{3};
constexpr static jit_compiler::BarrierFlags DefaultBarriersFlags{
jit_compiler::getLocalAndGlobalBarrierFlag()};

private:
// This needs to be in sync with the metadata kind
Expand Down Expand Up @@ -155,11 +156,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
///
/// Flags to apply to the barrier to be introduced between fused kernels.
///
/// Possible values:
/// - -1: Do not insert barrier
/// - 1: ensure correct ordering of memory operations to local memory
/// - 2: ensure correct ordering of memory operations to global memory
const int BarriersFlags{DefaultBarriersFlags};
const jit_compiler::BarrierFlags BarriersFlags{DefaultBarriersFlags};

///
/// Merge the content of Other into Attributes, adding, removing or updating
Expand Down
Loading