Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
cd9818b
[SYCL][NFC] Call getCacheItemPath only if cache is enabled
bader Dec 28, 2021
04e3869
[SYCL][NFC] Don't include sycl.hpp from headers
bader Dec 28, 2021
ba29bbe
[SYCL][NFC] Factor out empty kernel creation boilerplate
bader Dec 28, 2021
f5b380b
[SYCL] Do not build device code for sub-devices.
bader Dec 23, 2021
5a3587e
Apply clang-format
bader Dec 28, 2021
28b7f80
Fix issues caught by pre-commit CI.
bader Dec 29, 2021
61e09bd
[NFC] Fix a few typos in the comments
bader Dec 29, 2021
d5b93f0
Merge remote-tracking branch 'intel/sycl' into optimize-build
bader Jan 20, 2022
a1e483a
Improved build results caching for GPU devices.
bader Jan 21, 2022
7ac48ae
Improve GPU caching.
bader Jan 24, 2022
d0f2861
Revert "Improve GPU caching."
bader Feb 8, 2022
231a1a3
Revert "Improved build results caching for GPU devices."
bader Feb 8, 2022
8f2d9c4
Merge remote-tracking branch 'intel/sycl' into optimize-build
bader Feb 8, 2022
d44e27f
Fix formatting.
bader Feb 8, 2022
6e310b0
Add device query for checking if device architecture is homogeneous
bader Feb 14, 2022
ce299cd
Merge remote-tracking branch 'intel/sycl' into optimize-build
bader Feb 14, 2022
e6ca4f9
Address code review feedback
bader Feb 15, 2022
bf57926
Added a FIXME comment.
bader Feb 17, 2022
d062d77
Merge remote-tracking branch 'intel/sycl' into optimize-build
bader Feb 17, 2022
0e650ea
Update sycl/source/detail/program_manager/program_manager.cpp
bader Feb 17, 2022
d1cc7aa
Move comment to Level Zero plug-in.
bader Feb 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ typedef enum {
PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025,
PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026,
PI_DEVICE_INFO_IMAGE_SRGB = 0x10027,
PI_DEVICE_INFO_HOMOGENEOUS_ARCH = 0x10028,
PI_DEVICE_INFO_ATOMIC_64 = 0x10110,
PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111,
PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000,
Expand Down
4 changes: 4 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1490,6 +1490,10 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
}
case PI_DEVICE_INFO_HOMOGENEOUS_ARCH: {
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_FALSE);
}
case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
Expand Down
2 changes: 2 additions & 0 deletions sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
return ReturnValue("");
case PI_DEVICE_INFO_VERSION:
return ReturnValue(CmEmuDeviceVersionString);
case PI_DEVICE_INFO_HOMOGENEOUS_ARCH: // emulator doesn't support partition
return ReturnValue(pi_bool{false});
case PI_DEVICE_INFO_COMPILER_AVAILABLE:
return ReturnValue(pi_bool{false});
case PI_DEVICE_INFO_LINKER_AVAILABLE:
Expand Down
4 changes: 4 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1409,6 +1409,10 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
}
case PI_DEVICE_INFO_HOMOGENEOUS_ARCH: {
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_FALSE);
}
case PI_DEVICE_INFO_COMPILER_AVAILABLE: {
return getInfo(param_value_size, param_value, param_value_size_ret,
PI_TRUE);
Expand Down
5 changes: 3 additions & 2 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2277,8 +2277,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
}
}
case PI_DEVICE_INFO_PARENT_DEVICE:
// TODO: all Level Zero devices are parent ?
return ReturnValue(pi_device{0});
return ReturnValue(Device->RootDevice);
case PI_DEVICE_INFO_PLATFORM:
return ReturnValue(Device->Platform);
case PI_DEVICE_INFO_VENDOR_ID:
Expand Down Expand Up @@ -2338,6 +2337,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
}
case PI_DEVICE_INFO_NAME:
return ReturnValue(Device->ZeDeviceProperties->name);
case PI_DEVICE_INFO_HOMOGENEOUS_ARCH:
return ReturnValue(PI_TRUE);
case PI_DEVICE_INFO_COMPILER_AVAILABLE:
return ReturnValue(pi_bool{1});
case PI_DEVICE_INFO_LINKER_AVAILABLE:
Expand Down
8 changes: 7 additions & 1 deletion sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,13 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
std::memcpy(paramValue, &result, sizeof(cl_bool));
return PI_SUCCESS;
}

case PI_DEVICE_INFO_HOMOGENEOUS_ARCH: {
// FIXME: conservatively return false due to lack of low-level API exposing
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we maybe return true for Intel GPU's already to get OpenCL backend parity with Level-Zero?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a check for GPU type, but w/o a vendor check. I'm not sure how many OpenCL implementations supports device partition, but I guess it's done for homogeneous GPU only. Let me know if you want to harden the check.

// actual status of this property
cl_bool result = false;
std::memcpy(paramValue, &result, sizeof(cl_bool));
return PI_SUCCESS;
}
case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D:
// Returns the maximum sizes of a work group for each dimension one
// could use to submit a kernel. There is no such query defined in OpenCL
Expand Down
4 changes: 1 addition & 3 deletions sycl/source/detail/device_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,11 @@ device_impl::device_impl(pi_native_handle InteropDeviceHandle,
Plugin.call<PiApiKind::piDeviceGetInfo>(
MDevice, PI_DEVICE_INFO_TYPE, sizeof(RT::PiDeviceType), &MType, nullptr);

RT::PiDevice parent = nullptr;
// TODO catch an exception and put it to list of asynchronous exceptions
Plugin.call<PiApiKind::piDeviceGetInfo>(MDevice, PI_DEVICE_INFO_PARENT_DEVICE,
sizeof(RT::PiDevice), &parent,
sizeof(RT::PiDevice), &MRootDevice,
nullptr);

MIsRootDevice = (nullptr == parent);
if (!InteroperabilityConstructor) {
// TODO catch an exception and put it to list of asynchronous exceptions
// Interoperability Constructor already calls DeviceRetain in
Expand Down
4 changes: 3 additions & 1 deletion sycl/source/detail/device_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,14 +226,16 @@ class device_impl {

bool isAssertFailSupported() const;

bool isRootDevice() const { return MRootDevice == nullptr; }

std::string getDeviceName() const;

private:
explicit device_impl(pi_native_handle InteropDevice, RT::PiDevice Device,
PlatformImplPtr Platform, const plugin &Plugin);
RT::PiDevice MDevice = 0;
RT::PiDeviceType MType;
bool MIsRootDevice = false;
RT::PiDevice MRootDevice = nullptr;
bool MIsHostDevice;
PlatformImplPtr MPlatform;
bool MIsAssertFailSupported = false;
Expand Down
10 changes: 8 additions & 2 deletions sycl/source/detail/persistent_device_code_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,13 @@ void PersistentDeviceCodeCache::putItemToDisc(
const SerializedObj &SpecConsts, const std::string &BuildOptionsString,
const RT::PiProgram &NativePrg) {

if (!isImageCached(Img))
return;

std::string DirName =
getCacheItemPath(Device, Img, SpecConsts, BuildOptionsString);

if (!isImageCached(Img) || DirName.empty())
if (DirName.empty())
return;

auto Plugin = detail::getSyclObjImpl(Device)->getPlugin();
Expand Down Expand Up @@ -137,10 +140,13 @@ std::vector<std::vector<char>> PersistentDeviceCodeCache::getItemFromDisc(
const device &Device, const RTDeviceBinaryImage &Img,
const SerializedObj &SpecConsts, const std::string &BuildOptionsString) {

if (!isImageCached(Img))
return {};

std::string Path =
getCacheItemPath(Device, Img, SpecConsts, BuildOptionsString);

if (!isImageCached(Img) || Path.empty() || !OSUtil::isPathPresent(Path))
if (Path.empty() || !OSUtil::isPathPresent(Path))
return {};

int i = 0;
Expand Down
36 changes: 33 additions & 3 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,10 +481,40 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(
if (Prg)
Prg->stableSerializeSpecConstRegistry(SpecConsts);

auto BuildF = [this, &M, &KSId, &ContextImpl, &DeviceImpl, Prg, &CompileOpts,
// Check if root device architecture is homogeneous and we can optimize builds
// for sub-devices
DeviceImplPtr RootDevImpl = DeviceImpl;
while (!RootDevImpl->isRootDevice()) {
auto ParentDev = detail::getSyclObjImpl(
RootDevImpl->get_info<info::device::parent_device>());
if (!ContextImpl->hasDevice(ParentDev))
break;
RootDevImpl = ParentDev;
}

pi_bool IsRootDeviceArchHomogeneous = PI_FALSE;
ContextImpl->getPlugin().call<PiApiKind::piDeviceGetInfo>(
RootDevImpl->getHandleRef(), PI_DEVICE_INFO_HOMOGENEOUS_ARCH,
sizeof(pi_bool), &IsRootDeviceArchHomogeneous, nullptr);

// FIXME: the logic is modified to work around unintuitive Intel OpenCL CPU
// implementation behavior. Kernels created with the program built for root
// device can be re-used on sub-devices, but other combinations doesn't work
// (e.g. clGetKernelWorkGroupInfo returns CL_INVALID_KERNEL if kernel was
// created from the program built for sub-device and re-used either on root or
// other sub-device).
// To work around this case we optimize only one case: root device shares the
// same context with its sub-device(s). We built for the root device and
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is "the same context" checked? What if context just has no root-device in it, only all of its sub-devices?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if context just has no root-device in it, only all of its sub-devices?

The optimization won't be enabled in such case.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, thanks. Maybe as a future optimization we could implicitly add the root-device to the context, if >1 of it's sub-devices are there already (such that we can save on 1+ module builds). If you agree, please consider adding a TODO comment.

Copy link
Contributor Author

@bader bader Feb 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if SYCL spec allows implicitly adding devices to the context implicitly created by the runtime, but I think it's not allowed if the context is provided by the user.

The latest patch removes following comment, which I was considering as direction for future optimizations: e6ca4f9#diff-78dd7f7ba0b6120dece1ae4ab5a09c9936ff654a1de2c31ff2dbb1fc58d90393L509-L511
I think it would be great if Level Zero allows us to re-use the program built for any (sub-)device and not only a root device. I tested it on Intel GPU and it works already, but again it's not guaranteed by the spec wording. In this case we don't need implicitly add the root-device to optimize the build for sub-devices.

@bashbaug, does it make sense to pursue this direction? If so, I can recover the comment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can think of some cases in theory at least where a program built for one sub-device wouldn't be valid for a sibling sub-device, so this is not a safe assumption in all cases. If we decide this is a direction we want to pursue we'd need to find a way to detect or request this behavior.

// cache the results.
// The expected solution is to build for any sub-device and use root device
// handle as cache key to share build results for any other sub-device or even
// a root device.
DeviceImplPtr Dev =
(IsRootDeviceArchHomogeneous == PI_TRUE) ? RootDevImpl : DeviceImpl;
auto BuildF = [this, &M, &KSId, &ContextImpl, &Dev, Prg, &CompileOpts,
&LinkOpts, &JITCompilationIsRequired, SpecConsts] {
auto Context = createSyclObjFromImpl<context>(ContextImpl);
auto Device = createSyclObjFromImpl<device>(DeviceImpl);
auto Device = createSyclObjFromImpl<device>(Dev);

const RTDeviceBinaryImage &Img =
getDeviceImage(M, KSId, Context, Device, JITCompilationIsRequired);
Expand Down Expand Up @@ -536,7 +566,7 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(
return BuiltProgram.release();
};

const RT::PiDevice PiDevice = DeviceImpl->getHandleRef();
const RT::PiDevice PiDevice = Dev->getHandleRef();

auto BuildResult = getOrBuild<PiProgramT, compile_program_error>(
Cache,
Expand Down
6 changes: 3 additions & 3 deletions sycl/source/detail/program_manager/program_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ class ProgramManager {
SerializedObj SpecConsts);
/// Builds or retrieves from cache a program defining the kernel with given
/// name.
/// \param M idenfies the OS module the kernel comes from (multiple OS modules
/// may have kernels with the same name)
/// \param M identifies the OS module the kernel comes from (multiple OS
/// modules may have kernels with the same name)
/// \param Context the context to build the program with
/// \param Device the device for which the program is built
/// \param KernelName the kernel's name
Expand Down Expand Up @@ -153,7 +153,7 @@ class ProgramManager {
/// \param NativePrg the native program, target for spec constant setting; if
/// not null then overrides the native program in Prg
/// \param Img A source of the information about which constants need
/// setting and symboling->integer spec constnant ID mapping. If not
/// setting and symboling->integer spec constant ID mapping. If not
/// null, overrides native program->binary image binding maintained by
/// the program manager.
void flushSpecConstants(const program_impl &Prg,
Expand Down
1 change: 1 addition & 0 deletions sycl/unittests/program_manager/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ add_sycl_unittest(ProgramManagerTests OBJECT
BuildLog.cpp
EliminatedArgMask.cpp
itt_annotations.cpp
SubDevices.cpp
)

155 changes: 155 additions & 0 deletions sycl/unittests/program_manager/SubDevices.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl/program.hpp>
#include <detail/kernel_bundle_impl.hpp>

#include <helpers/CommonRedefinitions.hpp>
#include <helpers/PiImage.hpp>
#include <helpers/PiMock.hpp>

#include <gtest/gtest.h>

#include <helpers/TestKernel.hpp>

static pi_device rootDevice;
static pi_device piSubDev1 = (pi_device)0x1;
static pi_device piSubDev2 = (pi_device)0x2;

namespace {
pi_result redefinedDeviceGetInfo(pi_device device, pi_device_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {
if (param_name == PI_DEVICE_INFO_PARTITION_PROPERTIES) {
if (!param_value) {
*param_value_size_ret = 2 * sizeof(pi_device_partition_property);
} else {
((pi_device_partition_property *)param_value)[0] =
PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
((pi_device_partition_property *)param_value)[1] =
PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
}
}
if (param_name == PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN) {
if (!param_value) {
*param_value_size_ret = sizeof(pi_device_affinity_domain);
} else {
((pi_device_affinity_domain *)param_value)[0] =
PI_DEVICE_AFFINITY_DOMAIN_NUMA |
PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE;
}
}
if (param_name == PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES) {
((pi_uint32 *)param_value)[0] = 2;
}
if (param_name == PI_DEVICE_INFO_PARENT_DEVICE) {
if (device == piSubDev1 || device == piSubDev2)
((pi_device *)param_value)[0] = rootDevice;
else
((pi_device *)param_value)[0] = nullptr;
}
return PI_SUCCESS;
}

pi_result redefinedDevicePartition(
pi_device Device, const pi_device_partition_property *Properties,
pi_uint32 NumDevices, pi_device *OutDevices, pi_uint32 *OutNumDevices) {
if (OutNumDevices)
*OutNumDevices = 2;
if (OutDevices) {
OutDevices[0] = {};
OutDevices[1] = {};
}
return PI_SUCCESS;
}

pi_result redefinedDeviceRetain(pi_device c) { return PI_SUCCESS; }

pi_result redefinedDeviceRelease(pi_device c) { return PI_SUCCESS; }

pi_result redefinedProgramBuild(
pi_program prog, pi_uint32, const pi_device *, const char *,
void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
static int m = 0;
m++;
// if called more than once return an error
if (m > 1)
return PI_ERROR_UNKNOWN;

return PI_SUCCESS;
}

pi_result redefinedContextCreate(const pi_context_properties *Properties,
pi_uint32 NumDevices, const pi_device *Devices,
void (*PFnNotify)(const char *ErrInfo,
const void *PrivateInfo,
size_t CB, void *UserData),
void *UserData, pi_context *RetContext) {
return PI_SUCCESS;
}
} // anonymous namespace

// Check that program is built once for all sub-devices
// FIXME: mock 3 devices (one root device + two sub-devices) within a single
// context.
TEST(SubDevices, DISABLED_BuildProgramForSubdevices) {
sycl::platform Plt{sycl::default_selector()};
// Host devices do not support sub-devices
if (Plt.is_host() || Plt.get_backend() == sycl::backend::ext_oneapi_cuda ||
Plt.get_backend() == sycl::backend::ext_oneapi_hip) {
std::cerr << "Test is not supported on "
<< Plt.get_info<sycl::info::platform::name>() << ", skipping\n";
GTEST_SKIP(); // test is not supported on selected platform.
}

// Setup Mock APIs
sycl::unittest::PiMock Mock{Plt};
setupDefaultMockAPIs(Mock);
Mock.redefine<sycl::detail::PiApiKind::piDeviceGetInfo>(
redefinedDeviceGetInfo);
Mock.redefine<sycl::detail::PiApiKind::piDevicePartition>(
redefinedDevicePartition);
Mock.redefine<sycl::detail::PiApiKind::piDeviceRetain>(redefinedDeviceRetain);
Mock.redefine<sycl::detail::PiApiKind::piDeviceRelease>(
redefinedDeviceRelease);
Mock.redefine<sycl::detail::PiApiKind::piProgramBuild>(redefinedProgramBuild);
Mock.redefine<sycl::detail::PiApiKind::piContextCreate>(
redefinedContextCreate);

// Create 2 sub-devices and use first platform device as a root device
const sycl::device device = Plt.get_devices()[0];
// Initialize root device
rootDevice = sycl::detail::getSyclObjImpl(device)->getHandleRef();
// Initialize sub-devices
auto PltImpl = sycl::detail::getSyclObjImpl(Plt);
auto subDev1 =
std::make_shared<sycl::detail::device_impl>(piSubDev1, PltImpl);
auto subDev2 =
std::make_shared<sycl::detail::device_impl>(piSubDev2, PltImpl);
sycl::context Ctx{
{device, sycl::detail::createSyclObjFromImpl<sycl::device>(subDev1),
sycl::detail::createSyclObjFromImpl<sycl::device>(subDev2)}};

// Create device binary description structures for getBuiltPIProgram API.
auto devBin = Img.convertToNativeType();
pi_device_binaries_struct devBinStruct{PI_DEVICE_BINARIES_VERSION, 1,
&devBin};
sycl::detail::ProgramManager::getInstance().addImages(&devBinStruct);

// Build program via getBuiltPIProgram API
sycl::detail::ProgramManager::getInstance().getBuiltPIProgram(
sycl::detail::OSUtil::getOSModuleHandle(&devBin),
sycl::detail::getSyclObjImpl(Ctx), subDev1,
sycl::detail::KernelInfo<TestKernel>::getName());
// This call should re-use built binary from the cache. If piProgramBuild is
// called again, the test will fail as second call of redefinedProgramBuild
sycl::detail::ProgramManager::getInstance().getBuiltPIProgram(
sycl::detail::OSUtil::getOSModuleHandle(&devBin),
sycl::detail::getSyclObjImpl(Ctx), subDev2,
sycl::detail::KernelInfo<TestKernel>::getName());
}