Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
3f0ce59
[SYCL] Get rid of device kernel info duplication
sergey-semenov Sep 9, 2025
1343d13
[SYCL] Use 'kernel's info instance for more image origins
sergey-semenov Oct 10, 2025
65e213b
Merge branch 'kernelinfo' into pmduplication
sergey-semenov Oct 10, 2025
3157885
Check kernel info ownership with mixed image origins
sergey-semenov Oct 16, 2025
68df2eb
Adjust kernel_from_file test
sergey-semenov Oct 16, 2025
f1b796c
Merge branch 'sycl' into pmduplication
sergey-semenov Oct 16, 2025
41a8a01
Merge branch 'sycl' into pmduplication
sergey-semenov Oct 20, 2025
0e1e67e
Update unit tests to contain image binaries
sergey-semenov Oct 20, 2025
40d1ee6
Merge branch 'sycl' into pmduplication
sergey-semenov Nov 5, 2025
3d952d5
Fix compilation
sergey-semenov Nov 6, 2025
a1c62d5
Merge branch 'sycl' into pmduplication
sergey-semenov Nov 6, 2025
a7a31e8
Appease clang-format
sergey-semenov Nov 6, 2025
b9f0548
Merge branch 'sycl' into pmduplication
sergey-semenov Nov 18, 2025
168d0a4
Merge branch 'sycl' into pmduplication
sergey-semenov Dec 3, 2025
1a663ba
Apply clang-format
sergey-semenov Dec 3, 2025
ac56e0a
Make implicit local arg member const
sergey-semenov Dec 3, 2025
8e4fe98
Fix build error
sergey-semenov Dec 4, 2025
59f8a66
Revert "Fix build error"
sergey-semenov Dec 8, 2025
83260d5
Revert "Make implicit local arg member const"
sergey-semenov Dec 8, 2025
00ee337
Add clarifying comment for implicitlocalargpos
sergey-semenov Dec 8, 2025
dbb4cc9
Minor update
sergey-semenov Dec 8, 2025
0dfc937
Merge branch 'sycl' into pmduplication
sergey-semenov Dec 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 5 additions & 8 deletions sycl/source/detail/device_kernel_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,7 @@ inline namespace _V1 {
namespace detail {

DeviceKernelInfo::DeviceKernelInfo(const CompileTimeKernelInfoTy &Info)
: CompileTimeKernelInfoTy(Info) {
init(Name.data());
}

void DeviceKernelInfo::init(std::string_view KernelName) {
auto &PM = detail::ProgramManager::getInstance();
MImplicitLocalArgPos = PM.kernelImplicitLocalArgPos(KernelName);
}
: CompileTimeKernelInfoTy(Info) {}

template <typename OtherTy>
inline constexpr bool operator==(const CompileTimeKernelInfoTy &LHS,
Expand Down Expand Up @@ -50,6 +43,10 @@ void DeviceKernelInfo::setCompileTimeInfoIfNeeded(
assert(Info == *this);
}

void DeviceKernelInfo::setImplicitLocalArgPos(int Pos) {
assert(!MImplicitLocalArgPos.has_value() || MImplicitLocalArgPos == Pos);
MImplicitLocalArgPos = Pos;
}
} // namespace detail
} // namespace _V1
} // namespace sycl
6 changes: 5 additions & 1 deletion sycl/source/detail/device_kernel_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,14 @@ class DeviceKernelInfo : public CompileTimeKernelInfoTy {

FastKernelSubcacheT &getKernelSubcache() { return MFastKernelSubcache; }

std::optional<int> getImplicitLocalArgPos() const {
const std::optional<int> &getImplicitLocalArgPos() const {
return MImplicitLocalArgPos;
}

// Implicit local argument position is used only for some backends, so this
// funciton allows setting it as more images are added.
void setImplicitLocalArgPos(int Pos);

private:
bool isCompileTimeInfoSet() const { return KernelSize != 0; }

Expand Down
2 changes: 1 addition & 1 deletion sycl/source/detail/get_device_kernel_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ inline namespace _V1 {
namespace detail {

DeviceKernelInfo &getDeviceKernelInfo(const CompileTimeKernelInfoTy &Info) {
return ProgramManager::getInstance().getOrCreateDeviceKernelInfo(Info);
return ProgramManager::getInstance().getDeviceKernelInfo(Info);
}

} // namespace detail
Expand Down
22 changes: 17 additions & 5 deletions sycl/source/detail/kernel_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ kernel_impl::kernel_impl(Managed<ur_kernel_handle_t> &&Kernel,
MCreatedFromSource(true),
MKernelBundleImpl(KernelBundleImpl ? KernelBundleImpl->shared_from_this()
: nullptr),
MIsInterop(true), MKernelArgMaskPtr{ArgMask},
MInteropDeviceKernelInfo(createCompileTimeKernelInfo(getName())) {
MIsInterop(true), MKernelArgMaskPtr{ArgMask}, MOwnsDeviceKernelInfo(true),
MDeviceKernelInfo(createCompileTimeKernelInfo(getName())) {
ur_context_handle_t UrContext = nullptr;
// Using the adapter from the passed ContextImpl
getAdapter().call<UrApiKind::urKernelGetInfo>(
Expand All @@ -59,9 +59,11 @@ kernel_impl::kernel_impl(Managed<ur_kernel_handle_t> &&Kernel,
MKernelBundleImpl(KernelBundleImpl.shared_from_this()),
MIsInterop(MDeviceImageImpl->getOriginMask() & ImageOriginInterop),
MKernelArgMaskPtr{ArgMask}, MCacheMutex{CacheMutex},
MInteropDeviceKernelInfo(MIsInterop
? createCompileTimeKernelInfo(getName())
: createCompileTimeKernelInfo()) {
MOwnsDeviceKernelInfo(checkOwnsDeviceKernelInfo()),
MDeviceKernelInfo(MOwnsDeviceKernelInfo
? createCompileTimeKernelInfo(getName())
: createCompileTimeKernelInfo()) {

// Enable USM indirect access for interop and non-sycl-jit source kernels.
// sycl-jit kernels will enable this if needed through the regular kernel
// path.
Expand Down Expand Up @@ -121,6 +123,16 @@ std::string_view kernel_impl::getName() const {
return MName;
}

bool kernel_impl::checkOwnsDeviceKernelInfo() {
// If the image originates from something other than standard offline
// compilation, this kernel needs to own its info structure.
// We could also have a mixed origin image, in which case the device kernel
// info might reside in program manager.
return MDeviceImageImpl->getOriginMask() != ImageOriginSYCLOffline &&
(!(MDeviceImageImpl->getOriginMask() & ImageOriginSYCLOffline) ||
!ProgramManager::getInstance().tryGetDeviceKernelInfo(getName()));
}

bool kernel_impl::isBuiltInKernel(device_impl &Device) const {
auto BuiltInKernels = Device.get_info<info::device::built_in_kernel_ids>();
if (BuiltInKernels.empty())
Expand Down
13 changes: 8 additions & 5 deletions sycl/source/detail/kernel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,11 @@ class kernel_impl {
std::mutex *getCacheMutex() const { return MCacheMutex; }
std::string_view getName() const;

bool checkOwnsDeviceKernelInfo();
DeviceKernelInfo &getDeviceKernelInfo() {
return MIsInterop
? MInteropDeviceKernelInfo
: ProgramManager::getInstance().getOrCreateDeviceKernelInfo(
return MOwnsDeviceKernelInfo
? MDeviceKernelInfo
: ProgramManager::getInstance().getDeviceKernelInfo(
Comment on lines +240 to +242
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just have a reference/pointer as the member (potentially pointing to the owning smart pointer member) instead of doing those lookups?

Copy link
Contributor Author

@sergey-semenov sergey-semenov Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we should. I was planning that as a follow-up change since the lookup is already there right now.

std::string_view(getName()));
}

Expand All @@ -255,9 +256,11 @@ class kernel_impl {
std::mutex *MCacheMutex = nullptr;
mutable std::string MName;

// It is used for the interop kernels only.
// Used for images that aren't obtained with standard SYCL offline
// compilation.
// For regular kernel we get DeviceKernelInfo from the ProgramManager.
DeviceKernelInfo MInteropDeviceKernelInfo;
bool MOwnsDeviceKernelInfo = false;
DeviceKernelInfo MDeviceKernelInfo;

bool isBuiltInKernel(device_impl &Device) const;
void checkIfValidForNumArgsInfoQuery() const;
Expand Down
38 changes: 24 additions & 14 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1510,27 +1510,34 @@ void ProgramManager::cacheKernelImplicitLocalArg(
Img.getImplicitLocalArg();
if (ImplicitLocalArgRange.isAvailable())
for (auto Prop : ImplicitLocalArgRange) {
m_KernelImplicitLocalArgPos[Prop->Name] =
DeviceBinaryProperty(Prop).asUint32();
auto It = m_DeviceKernelInfoMap.find(Prop->Name);
assert(It != m_DeviceKernelInfoMap.end());
It->second.setImplicitLocalArgPos(DeviceBinaryProperty(Prop).asUint32());
}
}

DeviceKernelInfo &ProgramManager::getOrCreateDeviceKernelInfo(
const CompileTimeKernelInfoTy &Info) {
DeviceKernelInfo &
ProgramManager::getDeviceKernelInfo(const CompileTimeKernelInfoTy &Info) {
std::lock_guard<std::mutex> Guard(m_DeviceKernelInfoMapMutex);
auto [Iter, Inserted] = m_DeviceKernelInfoMap.try_emplace(Info.Name, Info);
if (!Inserted)
Iter->second.setCompileTimeInfoIfNeeded(Info);
return Iter->second;
auto It = m_DeviceKernelInfoMap.find(Info.Name);
assert(It != m_DeviceKernelInfoMap.end());
It->second.setCompileTimeInfoIfNeeded(Info);
return It->second;
}

DeviceKernelInfo &
ProgramManager::getOrCreateDeviceKernelInfo(std::string_view KernelName) {
ProgramManager::getDeviceKernelInfo(std::string_view KernelName) {
std::lock_guard<std::mutex> Guard(m_DeviceKernelInfoMapMutex);
CompileTimeKernelInfoTy DefaultCompileTimeInfo{KernelName};
auto Result =
m_DeviceKernelInfoMap.try_emplace(KernelName, DefaultCompileTimeInfo);
return Result.first->second;
auto It = m_DeviceKernelInfoMap.find(KernelName);
assert(It != m_DeviceKernelInfoMap.end());
return It->second;
}

DeviceKernelInfo *
ProgramManager::tryGetDeviceKernelInfo(std::string_view KernelName) {
std::lock_guard<std::mutex> Guard(m_DeviceKernelInfoMapMutex);
auto It = m_DeviceKernelInfoMap.find(KernelName);
return It != m_DeviceKernelInfoMap.end() ? &It->second : nullptr;
}

static bool isBfloat16DeviceLibImage(sycl_device_binary RawImg,
Expand Down Expand Up @@ -1733,6 +1740,10 @@ void ProgramManager::addImage(sycl_device_binary RawImg,
m_KernelIDs2BinImage.insert(std::make_pair(It->second, Img.get()));
KernelIDs->push_back(It->second);

CompileTimeKernelInfoTy DefaultCompileTimeInfo{std::string_view(name)};
m_DeviceKernelInfoMap.try_emplace(std::string_view(name),
DefaultCompileTimeInfo);

// Keep track of image to kernel name reference count for cleanup.
m_KernelNameRefCount[name]++;
}
Expand Down Expand Up @@ -1924,7 +1935,6 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) {
if (--RefCount == 0) {
// TODO aggregate all these maps into a single one since their entries
// share lifetime.
m_KernelImplicitLocalArgPos.erase(Name);
m_DeviceKernelInfoMap.erase(Name);
m_KernelNameRefCount.erase(RefCountIt);
if (Name2IDIt != m_KernelName2KernelIDs.end())
Expand Down
23 changes: 4 additions & 19 deletions sycl/source/detail/program_manager/program_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,17 +381,11 @@ class ProgramManager {

SanitizerType kernelUsesSanitizer() const { return m_SanitizerFoundInImage; }

std::optional<int>
kernelImplicitLocalArgPos(std::string_view KernelName) const {
auto it = m_KernelImplicitLocalArgPos.find(KernelName);
if (it != m_KernelImplicitLocalArgPos.end())
return it->second;
return {};
}
void cacheKernelImplicitLocalArg(const RTDeviceBinaryImage &Img);

DeviceKernelInfo &
getOrCreateDeviceKernelInfo(const CompileTimeKernelInfoTy &Info);
DeviceKernelInfo &getOrCreateDeviceKernelInfo(std::string_view KernelName);
DeviceKernelInfo &getDeviceKernelInfo(const CompileTimeKernelInfoTy &Info);
DeviceKernelInfo &getDeviceKernelInfo(std::string_view KernelName);
DeviceKernelInfo *tryGetDeviceKernelInfo(std::string_view KernelName);

std::set<const RTDeviceBinaryImage *>
getRawDeviceImages(const std::vector<kernel_id> &KernelIDs);
Expand Down Expand Up @@ -420,9 +414,6 @@ class ProgramManager {
/// Dumps image to current directory
void dumpImage(const RTDeviceBinaryImage &Img, uint32_t SequenceID = 0) const;

/// Add info on kernels using local arg into cache
void cacheKernelImplicitLocalArg(const RTDeviceBinaryImage &Img);

std::set<const RTDeviceBinaryImage *>
collectDependentDeviceImagesForVirtualFunctions(
const RTDeviceBinaryImage &Img, const device_impl &Dev);
Expand Down Expand Up @@ -529,12 +520,6 @@ class ProgramManager {
bool m_UseSpvFile = false;
RTDeviceBinaryImageUPtr m_SpvFileImage;

// std::less<> is a transparent comparator that enabled comparison between
// different types without temporary key_type object creation. This includes
// standard overloads, such as comparison between std::string and
// std::string_view or just char*.
std::unordered_map<std::string_view, int> m_KernelImplicitLocalArgPos;

// Map for storing device kernel information. Runtime lookup should be avoided
// by caching the pointers when possible.
std::unordered_map<std::string_view, DeviceKernelInfo> m_DeviceKernelInfoMap;
Expand Down
2 changes: 1 addition & 1 deletion sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2438,7 +2438,7 @@ static ur_result_t SetKernelParamsAndLaunch(
applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
}

std::optional<int> ImplicitLocalArg =
const std::optional<int> &ImplicitLocalArg =
DeviceKernelInfo.getImplicitLocalArgPos();
// Set the implicit local memory buffer to support
// get_work_group_scratch_memory. This is for backend not supporting
Expand Down
2 changes: 1 addition & 1 deletion sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ detail::EventImplPtr handler::finalize() {
// Fetch the device kernel info pointer if it hasn't been set (e.g.
// in kernel bundle or free function cases).
impl->MKernelData.setDeviceKernelInfoPtr(
&detail::ProgramManager::getInstance().getOrCreateDeviceKernelInfo(
&detail::ProgramManager::getInstance().getDeviceKernelInfo(
std::string_view(MKernelName)));
}
assert(impl->MKernelData.getKernelName() == MKernelName);
Expand Down
17 changes: 11 additions & 6 deletions sycl/test-e2e/Config/kernel_from_file.cpp
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
// REQUIRES: target-spir

// FIXME Disabled fallback assert as it'll require either online linking or
// explicit offline linking step here
// FIXME separate compilation requires -fno-sycl-dead-args-optimization
// As we are doing a separate device compilation here, we need to explicitly
// add the device lib instrumentation (itt_compiler_wrapper)
// RUN: %clangxx -Wno-error=ignored-attributes -DSYCL_DISABLE_FALLBACK_ASSERT %cxx_std_optionc++17 -fsycl-device-only -fno-sycl-dead-args-optimization -Xclang -fsycl-int-header=%t.h %s -o %t.bc -Xclang -verify-ignore-unexpected=note,warning -Wno-sycl-strict
// RUN: %clangxx -Wno-error=ignored-attributes -DUSED_KERNEL -fno-sycl-dead-args-optimization %cxx_std_optionc++17 -fsycl-device-only -Xclang -fsycl-int-header=%t.h %s -o %t.bc -Xclang -verify-ignore-unexpected=note,warning -Wno-sycl-strict
// >> ---- unbundle compiler wrapper and asan device objects
// RUN: clang-offload-bundler -type=o -targets=sycl-spir64-unknown-unknown -input=%sycl_static_libs_dir/libsycl-itt-compiler-wrappers%obj_ext -output=%t_compiler_wrappers.bc -unbundle
// RUN: %if linux %{ clang-offload-bundler -type=o -targets=sycl-spir64-unknown-unknown -input=%sycl_static_libs_dir/libsycl-asan%obj_ext -output=%t_asan.bc -unbundle %}
// >> ---- link device code
// RUN: %if linux %{ llvm-link -o=%t_app.bc %t.bc %t_compiler_wrappers.bc %t_asan.bc %} %else %{ llvm-link -o=%t_app.bc %t.bc %t_compiler_wrappers.bc %}
// >> ---- translate to SPIR-V
// RUN: llvm-spirv -o %t.spv %t_app.bc
// RUN: %clangxx -Wno-error=ignored-attributes %sycl_include -DSYCL_DISABLE_FALLBACK_ASSERT %cxx_std_optionc++17 %include_option %t.h %s -o %t.out %sycl_options -Xclang -verify-ignore-unexpected=note,warning %if preview-mode %{-Wno-unused-command-line-argument%}
// Need to perform full compilation here since the SYCL runtime uses image
// properties from the fat binary.
// RUN: %{build} -fno-sycl-dead-args-optimization -o %t.out
// RUN: env SYCL_USE_KERNEL_SPV=%t.spv %{run} %t.out

#include <iostream>
Expand All @@ -31,10 +31,15 @@ int main(int argc, char **argv) {
event e = myQueue.submit([&](handler &cgh) {
auto ptr = buf.get_access<access::mode::read_write>(cgh);

cgh.single_task<class my_kernel>([=]() { ptr[0]++; });
cgh.single_task<class my_kernel>([=]() {
#ifdef USED_KERNEL
ptr[0]++;
#else
ptr[0]--;
#endif
});
});
e.wait_and_throw();

} catch (sycl::exception const &e) {
std::cerr << "SYCL exception caught:\n";
std::cerr << e.what() << "\n";
Expand Down
6 changes: 6 additions & 0 deletions sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ class Kernel3;
MOCK_INTEGRATION_HEADER(Kernel1)
MOCK_INTEGRATION_HEADER(Kernel2)
MOCK_INTEGRATION_HEADER(Kernel3)
static sycl::unittest::MockDeviceImage CommandGraphImgs[3] = {
sycl::unittest::generateDefaultImage({"Kernel1"}),
sycl::unittest::generateDefaultImage({"Kernel2"}),
sycl::unittest::generateDefaultImage({"Kernel3"})};
static sycl::unittest::MockDeviceImageArray<3> CommandGraphImgArray{
CommandGraphImgs};

using namespace sycl;
using namespace sycl::ext::oneapi;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ using namespace sycl::ext::oneapi;

class MockKernel;
MOCK_INTEGRATION_HEADER(MockKernel)

static sycl::unittest::MockDeviceImage MockKernelImg =
sycl::unittest::generateDefaultImage({"MockKernel"});
static sycl::unittest::MockDeviceImageArray<1> MockKernelImgArray{
&MockKernelImg};
/**
* Checks that the operators and constructors of graph related classes meet the
* common reference semantics.
Expand Down
12 changes: 0 additions & 12 deletions sycl/unittests/program_manager/Cleanup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,6 @@ class ProgramManagerExposed : public sycl::detail::ProgramManager {
return m_EliminatedKernelArgMasks;
}

std::unordered_map<std::string_view, int> &getKernelImplicitLocalArgPos() {
return m_KernelImplicitLocalArgPos;
}

std::unordered_map<std::string,
std::unique_ptr<sycl::detail::HostPipeMapEntry>> &
getHostPipes() {
Expand Down Expand Up @@ -304,8 +300,6 @@ void checkAllInvolvedContainers(ProgramManagerExposed &PM,
"Kernel name reference count " + CommentPostfix);
EXPECT_EQ(PM.getEliminatedKernelArgMask().size(), ExpectedImgCount)
<< "Eliminated kernel arg mask " + CommentPostfix;
EXPECT_EQ(PM.getKernelImplicitLocalArgPos().size(), ExpectedEntryCount)
<< "Kernel implicit local arg pos " + CommentPostfix;

if (!MultipleImgsPerEntryTestCase) {
// FIXME expected to fail for now, device globals cleanup seems to be
Expand Down Expand Up @@ -355,10 +349,6 @@ TEST(ImageRemoval, BaseContainers) {
generateRefName("B", "HostPipe").c_str());
PM.addOrInitHostPipeEntry(PipeC::get_host_ptr(),
generateRefName("C", "HostPipe").c_str());
std::vector<std::string> KernelNames =
generateRefNames({"A", "B", "C"}, "Kernel");
for (const std::string &Name : KernelNames)
PM.getOrCreateDeviceKernelInfo(Name);

checkAllInvolvedContainers(PM, ImagesToRemove.size() + ImagesToKeep.size(),
{"A", "B", "C"}, "check failed before removal");
Expand All @@ -382,8 +372,6 @@ TEST(ImageRemoval, MultipleImagesPerEntry) {
convertAndAddImages(PM, ImagesToRemoveSameEntries, NativeImagesForRemoval,
TestBinaries);

std::string KernelName = generateRefName("A", "Kernel");
PM.getOrCreateDeviceKernelInfo(KernelName);
checkAllInvolvedContainers(
PM, ImagesToRemoveSameEntries.size() + ImagesToKeepSameEntries.size(),
/*ExpectedEntryCount*/ 1, {"A"}, "check failed before removal",
Expand Down