-
Notifications
You must be signed in to change notification settings - Fork 809
[SYCL] Do not build device code for sub-devices #5240
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
cd9818b
04e3869
ba29bbe
f5b380b
5a3587e
28b7f80
61e09bd
d5b93f0
a1e483a
7ac48ae
d0f2861
231a1a3
8f2d9c4
d44e27f
6e310b0
ce299cd
e6ca4f9
bf57926
d062d77
0e650ea
d1cc7aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -203,7 +203,13 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, | |
| std::memcpy(paramValue, &result, sizeof(cl_bool)); | ||
| return PI_SUCCESS; | ||
| } | ||
|
|
||
| case PI_DEVICE_INFO_HOMOGENEOUS_ARCH: { | ||
| // FIXME: conservatively return false due to lack of low-level API exposing | ||
|
||
| // actual status of this property | ||
| cl_bool result = false; | ||
| std::memcpy(paramValue, &result, sizeof(cl_bool)); | ||
| return PI_SUCCESS; | ||
| } | ||
| case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: | ||
| // Returns the maximum sizes of a work group for each dimension one | ||
| // could use to submit a kernel. There is no such query defined in OpenCL | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -481,10 +481,40 @@ RT::PiProgram ProgramManager::getBuiltPIProgram( | |
| if (Prg) | ||
| Prg->stableSerializeSpecConstRegistry(SpecConsts); | ||
|
|
||
| auto BuildF = [this, &M, &KSId, &ContextImpl, &DeviceImpl, Prg, &CompileOpts, | ||
| // Check if root device architecture is homogeneous and we can optimize builds | ||
| // for sub-devices | ||
| DeviceImplPtr RootDevImpl = DeviceImpl; | ||
| while (!RootDevImpl->isRootDevice()) { | ||
| auto ParentDev = detail::getSyclObjImpl( | ||
| RootDevImpl->get_info<info::device::parent_device>()); | ||
| if (!ContextImpl->hasDevice(ParentDev)) | ||
| break; | ||
| RootDevImpl = ParentDev; | ||
| } | ||
|
|
||
| pi_bool IsRootDeviceArchHomogeneous = PI_FALSE; | ||
| ContextImpl->getPlugin().call<PiApiKind::piDeviceGetInfo>( | ||
| RootDevImpl->getHandleRef(), PI_DEVICE_INFO_HOMOGENEOUS_ARCH, | ||
| sizeof(pi_bool), &IsRootDeviceArchHomogeneous, nullptr); | ||
|
|
||
| // FIXME: the logic is modified to work around unintuitive Intel OpenCL CPU | ||
| // implementation behavior. Kernels created with the program built for root | ||
| // device can be re-used on sub-devices, but other combinations don't work | ||
| // (e.g. clGetKernelWorkGroupInfo returns CL_INVALID_KERNEL if kernel was | ||
| // created from the program built for sub-device and re-used either on root or | ||
| // other sub-device). | ||
| // To work around this case we optimize only one case: root device shares the | ||
| // same context with its sub-device(s). We build for the root device and | ||
|
||
| // cache the results. | ||
| // The expected solution is to build for any sub-device and use root device | ||
| // handle as cache key to share build results for any other sub-device or even | ||
| // a root device. | ||
| DeviceImplPtr Dev = | ||
| (IsRootDeviceArchHomogeneous == PI_TRUE) ? RootDevImpl : DeviceImpl; | ||
| auto BuildF = [this, &M, &KSId, &ContextImpl, &Dev, Prg, &CompileOpts, | ||
| &LinkOpts, &JITCompilationIsRequired, SpecConsts] { | ||
| auto Context = createSyclObjFromImpl<context>(ContextImpl); | ||
| auto Device = createSyclObjFromImpl<device>(DeviceImpl); | ||
| auto Device = createSyclObjFromImpl<device>(Dev); | ||
|
|
||
| const RTDeviceBinaryImage &Img = | ||
| getDeviceImage(M, KSId, Context, Device, JITCompilationIsRequired); | ||
|
|
@@ -536,7 +566,7 @@ RT::PiProgram ProgramManager::getBuiltPIProgram( | |
| return BuiltProgram.release(); | ||
| }; | ||
|
|
||
| const RT::PiDevice PiDevice = DeviceImpl->getHandleRef(); | ||
| const RT::PiDevice PiDevice = Dev->getHandleRef(); | ||
|
|
||
| auto BuildResult = getOrBuild<PiProgramT, compile_program_error>( | ||
| Cache, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,155 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #include <CL/sycl/program.hpp> | ||
| #include <detail/kernel_bundle_impl.hpp> | ||
|
|
||
| #include <helpers/CommonRedefinitions.hpp> | ||
| #include <helpers/PiImage.hpp> | ||
| #include <helpers/PiMock.hpp> | ||
|
|
||
| #include <gtest/gtest.h> | ||
|
|
||
| #include <helpers/TestKernel.hpp> | ||
|
|
||
| static pi_device rootDevice; | ||
| static pi_device piSubDev1 = (pi_device)0x1; | ||
| static pi_device piSubDev2 = (pi_device)0x2; | ||
|
|
||
| namespace { | ||
| // Mock replacement for piDeviceGetInfo used by the sub-device test. | ||
| // Handles four queries: partition properties (two affinity-domain entries), | ||
| // partition affinity domain (NUMA | NEXT_PARTITIONABLE), the maximum number | ||
| // of sub-devices (2), and the parent device (rootDevice for piSubDev1 and | ||
| // piSubDev2, nullptr for anything else). Any other param_name leaves both | ||
| // output arguments untouched. Always returns PI_SUCCESS. | ||
| // Null output pointers are tolerated: a size-only query (param_value is | ||
| // null) writes param_value_size_ret only when it is non-null, and no value | ||
| // is ever written through a null param_value. param_value_size is not | ||
| // inspected. | ||
| // NOTE(review): PI_DEVICE_INFO_HOMOGENEOUS_ARCH is not handled here, so a | ||
| // caller's pre-initialized result stays unchanged - confirm this is intended. | ||
| pi_result redefinedDeviceGetInfo(pi_device device, pi_device_info param_name, | ||
|                                  size_t param_value_size, void *param_value, | ||
|                                  size_t *param_value_size_ret) { | ||
|   if (param_name == PI_DEVICE_INFO_PARTITION_PROPERTIES) { | ||
|     if (!param_value) { | ||
|       if (param_value_size_ret) | ||
|         *param_value_size_ret = 2 * sizeof(pi_device_partition_property); | ||
|     } else { | ||
|       auto *Props = static_cast<pi_device_partition_property *>(param_value); | ||
|       Props[0] = PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN; | ||
|       Props[1] = PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN; | ||
|     } | ||
|   } | ||
|   if (param_name == PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN) { | ||
|     if (!param_value) { | ||
|       if (param_value_size_ret) | ||
|         *param_value_size_ret = sizeof(pi_device_affinity_domain); | ||
|     } else { | ||
|       static_cast<pi_device_affinity_domain *>(param_value)[0] = | ||
|           PI_DEVICE_AFFINITY_DOMAIN_NUMA | | ||
|           PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE; | ||
|     } | ||
|   } | ||
|   if (param_name == PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES) { | ||
|     if (!param_value) { | ||
|       // Size-only query: previously this dereferenced the null param_value. | ||
|       if (param_value_size_ret) | ||
|         *param_value_size_ret = sizeof(pi_uint32); | ||
|     } else { | ||
|       static_cast<pi_uint32 *>(param_value)[0] = 2; | ||
|     } | ||
|   } | ||
|   if (param_name == PI_DEVICE_INFO_PARENT_DEVICE) { | ||
|     if (!param_value) { | ||
|       // Size-only query: previously this dereferenced the null param_value. | ||
|       if (param_value_size_ret) | ||
|         *param_value_size_ret = sizeof(pi_device); | ||
|     } else { | ||
|       const bool IsSubDevice = (device == piSubDev1 || device == piSubDev2); | ||
|       static_cast<pi_device *>(param_value)[0] = | ||
|           IsSubDevice ? rootDevice : nullptr; | ||
|     } | ||
|   } | ||
|   return PI_SUCCESS; | ||
| } | ||
|
|
||
| // Mock replacement for piDevicePartition. Reports two sub-devices through | ||
| // OutNumDevices (when non-null) and stores value-initialized (null) handles | ||
| // into OutDevices (when non-null). At most two entries are written and never | ||
| // more than NumDevices, so a shorter caller-provided array is not overrun. | ||
| // Device and Properties are ignored. Always returns PI_SUCCESS. | ||
| pi_result redefinedDevicePartition( | ||
|     pi_device Device, const pi_device_partition_property *Properties, | ||
|     pi_uint32 NumDevices, pi_device *OutDevices, pi_uint32 *OutNumDevices) { | ||
|   constexpr pi_uint32 MockSubDeviceCount = 2; | ||
|   if (OutNumDevices) | ||
|     *OutNumDevices = MockSubDeviceCount; | ||
|   if (OutDevices) { | ||
|     for (pi_uint32 I = 0; I < NumDevices && I < MockSubDeviceCount; ++I) | ||
|       OutDevices[I] = {}; | ||
|   } | ||
|   return PI_SUCCESS; | ||
| } | ||
|
|
||
| // Mock replacement for piDeviceRetain: ignores the handle and reports | ||
| // success. | ||
| pi_result redefinedDeviceRetain(pi_device c) { | ||
|   return PI_SUCCESS; | ||
| } | ||
|
|
||
| // Mock replacement for piDeviceRelease: ignores the handle and reports | ||
| // success. | ||
| pi_result redefinedDeviceRelease(pi_device c) { | ||
|   return PI_SUCCESS; | ||
| } | ||
|
|
||
| // Mock replacement for piProgramBuild. The first invocation succeeds; every | ||
| // later invocation returns PI_ERROR_UNKNOWN. The invocation count is kept in | ||
| // a function-local static and is never reset. All arguments are ignored. | ||
| pi_result redefinedProgramBuild( | ||
|     pi_program prog, pi_uint32, const pi_device *, const char *, | ||
|     void (*pfn_notify)(pi_program program, void *user_data), void *user_data) { | ||
|   static int BuildCalls = 0; | ||
|   ++BuildCalls; | ||
|   // Only the very first build is allowed to succeed. | ||
|   return (BuildCalls > 1) ? PI_ERROR_UNKNOWN : PI_SUCCESS; | ||
| } | ||
|
|
||
| // Mock replacement for piContextCreate: reports success without touching | ||
| // any argument, so RetContext is left exactly as the caller passed it. | ||
| pi_result redefinedContextCreate( | ||
|     const pi_context_properties *Properties, pi_uint32 NumDevices, | ||
|     const pi_device *Devices, | ||
|     void (*PFnNotify)(const char *ErrInfo, const void *PrivateInfo, size_t CB, | ||
|                       void *UserData), | ||
|     void *UserData, pi_context *RetContext) { | ||
|   return PI_SUCCESS; | ||
| } | ||
| } // anonymous namespace | ||
|
|
||
| // Check that the program is built only once for all sub-devices: the second getBuiltPIProgram call below must not reach piProgramBuild again. | ||
| // FIXME: mock 3 devices (one root device + two sub-devices) within a single | ||
| // context. The test stays disabled (DISABLED_ prefix) until then. | ||
| TEST(SubDevices, DISABLED_BuildProgramForSubdevices) { | ||
| sycl::platform Plt{sycl::default_selector()}; | ||
| // Skip host platforms (host devices do not support sub-devices) as well as the CUDA and HIP backends. | ||
| if (Plt.is_host() || Plt.get_backend() == sycl::backend::ext_oneapi_cuda || | ||
| Plt.get_backend() == sycl::backend::ext_oneapi_hip) { | ||
| std::cerr << "Test is not supported on " | ||
| << Plt.get_info<sycl::info::platform::name>() << ", skipping\n"; | ||
| GTEST_SKIP(); // test is not supported on selected platform. | ||
| } | ||
|
|
||
| // Set up mock APIs: install the defaults, then override the device, program-build and context-create entry points. | ||
| sycl::unittest::PiMock Mock{Plt}; | ||
| setupDefaultMockAPIs(Mock); | ||
| Mock.redefine<sycl::detail::PiApiKind::piDeviceGetInfo>( | ||
| redefinedDeviceGetInfo); | ||
| Mock.redefine<sycl::detail::PiApiKind::piDevicePartition>( | ||
| redefinedDevicePartition); | ||
| Mock.redefine<sycl::detail::PiApiKind::piDeviceRetain>(redefinedDeviceRetain); | ||
| Mock.redefine<sycl::detail::PiApiKind::piDeviceRelease>( | ||
| redefinedDeviceRelease); | ||
| Mock.redefine<sycl::detail::PiApiKind::piProgramBuild>(redefinedProgramBuild); | ||
| Mock.redefine<sycl::detail::PiApiKind::piContextCreate>( | ||
| redefinedContextCreate); | ||
|
|
||
| // Create 2 sub-devices and use the first platform device as the root device | ||
| const sycl::device device = Plt.get_devices()[0]; | ||
| // Initialize root device (the handle the mocked PARENT_DEVICE query returns) | ||
| rootDevice = sycl::detail::getSyclObjImpl(device)->getHandleRef(); | ||
| // Initialize sub-devices from the fake handles piSubDev1 and piSubDev2 | ||
| auto PltImpl = sycl::detail::getSyclObjImpl(Plt); | ||
| auto subDev1 = | ||
| std::make_shared<sycl::detail::device_impl>(piSubDev1, PltImpl); | ||
| auto subDev2 = | ||
| std::make_shared<sycl::detail::device_impl>(piSubDev2, PltImpl); | ||
| sycl::context Ctx{ | ||
| {device, sycl::detail::createSyclObjFromImpl<sycl::device>(subDev1), | ||
| sycl::detail::createSyclObjFromImpl<sycl::device>(subDev2)}}; | ||
|
|
||
| // Create device binary description structures for getBuiltPIProgram API. | ||
| auto devBin = Img.convertToNativeType(); | ||
| pi_device_binaries_struct devBinStruct{PI_DEVICE_BINARIES_VERSION, 1, | ||
| &devBin}; | ||
| sycl::detail::ProgramManager::getInstance().addImages(&devBinStruct); | ||
|
|
||
| // Build program via getBuiltPIProgram API (first build, for subDev1) | ||
| sycl::detail::ProgramManager::getInstance().getBuiltPIProgram( | ||
| sycl::detail::OSUtil::getOSModuleHandle(&devBin), | ||
| sycl::detail::getSyclObjImpl(Ctx), subDev1, | ||
| sycl::detail::KernelInfo<TestKernel>::getName()); | ||
| // This call should re-use the built binary from the cache. If piProgramBuild is | ||
| // called again, the test will fail because the second call of redefinedProgramBuild returns PI_ERROR_UNKNOWN. | ||
| sycl::detail::ProgramManager::getInstance().getBuiltPIProgram( | ||
| sycl::detail::OSUtil::getOSModuleHandle(&devBin), | ||
| sycl::detail::getSyclObjImpl(Ctx), subDev2, | ||
| sycl::detail::KernelInfo<TestKernel>::getName()); | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.