Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,18 @@ The load and store sub-group functions enable developers to assert that all work
|===
|Function|Description

|+template <typename T> T load(sub_group sg, const T *src)+
|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to private address space will cause an assertion. Other address spaces are cast to global with potentially undefined behavior.

|+template <typename T, access::address_space Space> T load(sub_group sg, const multi_ptr<T,Space> src)+
|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

|+template <int N, typename T, access::address_space Space> vec<T,N> load(sub_group sg, const multi_ptr<T,Space> src)+
|Load contiguous data from _src_. Returns _N_ elements per work-item, corresponding to the _N_ memory locations at _src_ + +i+ * +get_max_local_range()+ + +get_local_id()+ for +i+ between 0 and _N_. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

|+template <typename T> void store(sub_group sg, T *dst, const T& x)+
|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to private address space will cause an assertion. Other address spaces are cast to global with potentially undefined behavior.

|+template <typename T, access::address_space Space> void store(sub_group sg, multi_ptr<T,Space> dst, const T& x)+
|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+.

Expand All @@ -165,6 +171,7 @@ None.
|========================================
|Rev|Date|Author|Changes
|1|2020-03-16|John Pennycook|*Initial public working draft*
|2|2021-02-26|Vladimir Lazarev|*Add load/store method for raw pointers*
|========================================

//************************************************************************
Expand Down
24 changes: 24 additions & 0 deletions sycl/include/CL/__spirv/spirv_ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,30 @@ __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long long)
__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Min)
__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max)

extern SYCL_EXTERNAL __attribute__((opencl_global)) void *
__spirv_GenericCastToPtrExplicit_ToGlobal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept;

extern SYCL_EXTERNAL __attribute__((opencl_local)) void *
__spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept;

template <typename dataT>
extern __attribute__((opencl_global)) dataT *
__spirv_GenericCastToPtrExplicit_ToGlobal(
const void *Ptr, __spv::StorageClass::Flag S) noexcept {
return (__attribute__((opencl_global))
dataT *)__spirv_GenericCastToPtrExplicit_ToGlobal(Ptr, S);
}

template <typename dataT>
extern __attribute__((opencl_local)) dataT *
__spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr,
__spv::StorageClass::Flag S) noexcept {
return (__attribute__((opencl_local))
dataT *)__spirv_GenericCastToPtrExplicit_ToLocal(Ptr, S);
}

template <typename dataT>
__SYCL_CONVERGENT__ extern SYCL_EXTERNAL dataT
__spirv_SubgroupShuffleINTEL(dataT Data, uint32_t InvocationId) noexcept;
Expand Down
40 changes: 40 additions & 0 deletions sycl/include/CL/__spirv/spirv_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,46 @@ struct Scope {
Flag flag_value;
};

struct StorageClass {
enum Flag : uint32_t {
UniformConstant = 0,
Input = 1,
Uniform = 2,
Output = 3,
Workgroup = 4,
CrossWorkgroup = 5,
Private = 6,
Function = 7,
Generic = 8,
PushConstant = 9,
AtomicCounter = 10,
Image = 11,
StorageBuffer = 12,
CallableDataKHR = 5328,
CallableDataNV = 5328,
IncomingCallableDataKHR = 5329,
IncomingCallableDataNV = 5329,
RayPayloadKHR = 5338,
RayPayloadNV = 5338,
HitAttributeKHR = 5339,
HitAttributeNV = 5339,
IncomingRayPayloadKHR = 5342,
IncomingRayPayloadNV = 5342,
ShaderRecordBufferKHR = 5343,
ShaderRecordBufferNV = 5343,
PhysicalStorageBuffer = 5349,
PhysicalStorageBufferEXT = 5349,
CodeSectionINTEL = 5605,
CapabilityUSMStorageClassesINTEL = 5935,
DeviceOnlyINTEL = 5936,
HostOnlyINTEL = 5937,
Max = 0x7fffffff,
};
constexpr StorageClass(Flag flag) : flag_value(flag) {}
constexpr operator uint32_t() const { return flag_value; }
Flag flag_value;
};

struct MemorySemanticsMask {

enum Flag : uint32_t {
Expand Down
90 changes: 90 additions & 0 deletions sycl/include/CL/sycl/ONEAPI/sub_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,47 @@ struct sub_group {

/* --- sub_group load/stores --- */
/* these can map to SIMD or block read/write hardware where available */
#ifdef __SYCL_DEVICE_ONLY__
// Method for decorated pointer
template <typename T>
detail::enable_if_t<
!std::is_same<typename detail::remove_AS<T>::type, T>::value, T>
load(T *src) const {
return load(sycl::multi_ptr<typename detail::remove_AS<T>::type,
sycl::detail::deduce_AS<T>::value>(
(typename detail::remove_AS<T>::type *)src));
}

// Method for raw pointer
template <typename T>
detail::enable_if_t<
std::is_same<typename detail::remove_AS<T>::type, T>::value, T>
load(T *src) const {

#ifdef __NVPTX__
return src[get_local_id()[0]];
#else // __NVPTX__
auto l = __spirv_GenericCastToPtrExplicit_ToLocal<T>(
src, __spv::StorageClass::Workgroup);
if (l)
return load(l);

auto g = __spirv_GenericCastToPtrExplicit_ToGlobal<T>(
src, __spv::StorageClass::CrossWorkgroup);
if (g)
return load(g);

assert(!"Sub-group load() is supported for local or global pointers only.");
return 0;
#endif // __NVPTX__
}
#else //__SYCL_DEVICE_ONLY__
template <typename T> T load(T *src) const {
(void)src;
throw runtime_error("Sub-groups are not supported on host device.",
PI_INVALID_DEVICE);
}
#endif //__SYCL_DEVICE_ONLY__

template <typename T, access::address_space Space>
sycl::detail::enable_if_t<
Expand Down Expand Up @@ -315,6 +356,55 @@ struct sub_group {
#endif
}

#ifdef __SYCL_DEVICE_ONLY__
// Method for decorated pointer
template <typename T>
detail::enable_if_t<
!std::is_same<typename detail::remove_AS<T>::type, T>::value>
store(T *dst, const typename detail::remove_AS<T>::type &x) const {
store(sycl::multi_ptr<typename detail::remove_AS<T>::type,
sycl::detail::deduce_AS<T>::value>(
(typename detail::remove_AS<T>::type *)dst),
x);
}

// Method for raw pointer
template <typename T>
detail::enable_if_t<
std::is_same<typename detail::remove_AS<T>::type, T>::value>
store(T *dst, const typename detail::remove_AS<T>::type &x) const {

#ifdef __NVPTX__
dst[get_local_id()[0]] = x;
#else // __NVPTX__
auto l = __spirv_GenericCastToPtrExplicit_ToLocal<T>(
dst, __spv::StorageClass::Workgroup);
if (l) {
store(l, x);
return;
}

auto g = __spirv_GenericCastToPtrExplicit_ToGlobal<T>(
dst, __spv::StorageClass::CrossWorkgroup);
if (g) {
store(g, x);
return;
}

assert(
!"Sub-group store() is supported for local or global pointers only.");
return;
#endif // __NVPTX__
}
#else //__SYCL_DEVICE_ONLY__
template <typename T> void store(T *dst, const T &x) const {
(void)dst;
(void)x;
throw runtime_error("Sub-groups are not supported on host device.",
PI_INVALID_DEVICE);
}
#endif //__SYCL_DEVICE_ONLY__

template <typename T, access::address_space Space>
sycl::detail::enable_if_t<
sycl::detail::sub_group::AcceptableForGlobalLoadStore<T, Space>::value>
Expand Down
51 changes: 34 additions & 17 deletions sycl/include/CL/sycl/access/access.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,17 +187,15 @@ template <typename ElementType>
struct DecoratedType<ElementType, access::address_space::local_space> {
using type = __OPENCL_LOCAL_AS__ ElementType;
};
template <class T> struct remove_AS { typedef T type; };

template <class T>
struct remove_AS {
typedef T type;
template <class T> struct deduce_AS {
static const access::address_space value =
access::address_space::global_space;
};

#ifdef __SYCL_DEVICE_ONLY__
template <class T>
struct remove_AS<__OPENCL_GLOBAL_AS__ T> {
typedef T type;
};
template <class T> struct remove_AS<__OPENCL_GLOBAL_AS__ T> { typedef T type; };

#ifdef __ENABLE_USM_ADDR_SPACE__
template <class T> struct remove_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
Expand All @@ -207,21 +205,40 @@ template <class T> struct remove_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
template <class T> struct remove_AS<__OPENCL_GLOBAL_HOST_AS__ T> {
typedef T type;
};

template <class T> struct deduce_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> {
static const access::address_space value =
access::address_space::global_device_space;
};

template <class T> struct deduce_AS<__OPENCL_GLOBAL_HOST_AS__ T> {
static const access::address_space value =
access::address_space::global_host_space;
};
#endif // __ENABLE_USM_ADDR_SPACE__

template <class T>
struct remove_AS<__OPENCL_PRIVATE_AS__ T> {
template <class T> struct remove_AS<__OPENCL_PRIVATE_AS__ T> {
typedef T type;
};

template <class T>
struct remove_AS<__OPENCL_LOCAL_AS__ T> {
template <class T> struct remove_AS<__OPENCL_LOCAL_AS__ T> { typedef T type; };

template <class T> struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
typedef T type;
};

template <class T>
struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
typedef T type;
template <class T> struct deduce_AS<__OPENCL_PRIVATE_AS__ T> {
static const access::address_space value =
access::address_space::private_space;
};

template <class T> struct deduce_AS<__OPENCL_LOCAL_AS__ T> {
static const access::address_space value = access::address_space::local_space;
};

template <class T> struct deduce_AS<__OPENCL_CONSTANT_AS__ T> {
static const access::address_space value =
access::address_space::constant_space;
};
#endif

Expand All @@ -231,8 +248,8 @@ struct remove_AS<__OPENCL_CONSTANT_AS__ T> {
#undef __OPENCL_LOCAL_AS__
#undef __OPENCL_CONSTANT_AS__
#undef __OPENCL_PRIVATE_AS__

#undef __OPENCL_GENERIC_AS__
} // namespace detail

} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
2 changes: 1 addition & 1 deletion sycl/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ set_target_properties(check-sycl-deploy PROPERTIES FOLDER "SYCL tests")
add_lit_testsuite(check-sycl-spirv "Running device-agnostic SYCL regression tests for SPIR-V"
${CMAKE_CURRENT_BINARY_DIR}
ARGS ${RT_TEST_ARGS}
PARAMS "SYCL_TRIPLE=spir64-unknown-linux-sycldevice"
PARAMS "SYCL_TRIPLE=spir64-unknown-unknown-sycldevice"
DEPENDS ${SYCL_TEST_DEPS}
EXCLUDE_FROM_CHECK_ALL
)
Expand Down
88 changes: 88 additions & 0 deletions sycl/test/extensions/sub_group_as.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -O3 -S -emit-llvm -x c++ %s -o - | FileCheck %s

#include <CL/sycl.hpp>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main(int argc, char *argv[]) {
cl::sycl::queue queue;
printf("Device Name = %s\n",
queue.get_device().get_info<cl::sycl::info::device::name>().c_str());

// Initialize some host memory
constexpr int N = 64;
int host_mem[N];
for (int i = 0; i < N; ++i) {
host_mem[i] = i * 100;
}

// Use the device to transform each value
{
cl::sycl::buffer<int, 1> buf(host_mem, N);
queue.submit([&](cl::sycl::handler &cgh) {
auto global =
buf.get_access<cl::sycl::access::mode::read_write,
cl::sycl::access::target::global_buffer>(cgh);
sycl::accessor<int, 1, sycl::access::mode::read_write,
sycl::access::target::local>
local(N, cgh);

cgh.parallel_for<class test>(
cl::sycl::nd_range<1>(N, 32), [=](cl::sycl::nd_item<1> it) {
int v[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
cl::sycl::ONEAPI::sub_group sg = it.get_sub_group();
if (!it.get_local_id(0)) {
int end = it.get_global_id(0) + it.get_local_range()[0];
for (int i = it.get_global_id(0); i < end; i++) {
local[i] = i;
}
}
// CHECK: call void @_Z22__spirv_ControlBarrierjjj
it.barrier();

int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) *
sg.get_max_local_range()[0];

// load for global address space
// CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)*
// CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv()
// CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)*
// CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)*
// CHECK: call spir_func void {{.*}}assert
auto x = sg.load(&global[i]);

// load() for local address space
// CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)*
// CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv()
// CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)*
// CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)*
// CHECK: call spir_func void {{.*}}assert
auto y = sg.load(&local[i]);

// load() for private address space
// CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)*
// CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv()
// CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)*
// CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)*
// CHECK: call spir_func void {{.*}}assert
auto z = sg.load(v + i);

// store() for global address space
// CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)*
// CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv()
// CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)*
// CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)*
// CHECK: call spir_func void {{.*}}assert
sg.store(&global[i], x + y + z);
});
});
}

return 0;
}
2 changes: 1 addition & 1 deletion sycl/test/lit.cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@

# Every SYCL implementation provides a host implementation.
config.available_features.add('host')
triple=lit_config.params.get('SYCL_TRIPLE', 'spir64-unknown-linux-sycldevice')
triple=lit_config.params.get('SYCL_TRIPLE', 'spir64-unknown-unknown-sycldevice')
lit_config.note("Triple: {}".format(triple))
config.substitutions.append( ('%sycl_triple', triple ) )

Expand Down
Loading