From fc98c22d13c5a5220822c5fb2b795d6f307b10f3 Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 29 Aug 2023 08:10:49 -0700 Subject: [PATCH 01/13] [SYCL][matrix] Update the query interface with the latest joint matrix approved syntax --- ...cl_ext_oneapi_device_architecture.asciidoc | 1 + .../experimental/device_architecture.hpp | 1 + .../ext/oneapi/matrix/static-query-use.hpp | 647 +++++++++--------- .../Matrix/joint_matrix_query_default.cpp | 5 +- sycl/test/matrix/query-use.cpp | 212 +++--- sycl/test/matrix/runtime-query.cpp | 50 ++ 6 files changed, 463 insertions(+), 453 deletions(-) create mode 100644 sycl/test/matrix/runtime-query.cpp diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc index 6c7bc2cce84b..cd8f878f56f7 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc @@ -99,6 +99,7 @@ namespace sycl::ext::oneapi::experimental { enum class architecture : /* unspecified */ { x86_64, + intel_cpu_spr, intel_gpu_bdw, intel_gpu_skl, intel_gpu_kbl, diff --git a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp index 5eb0a301ce4b..624ae83897e4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp @@ -14,6 +14,7 @@ namespace ext::oneapi::experimental { enum class architecture { x86_64, + intel_cpu_spr, intel_gpu_bdw, intel_gpu_skl, intel_gpu_kbl, diff --git a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp index f30ff53bb8a5..a3db6eca3ec3 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp @@ -35,40 +35,42 @@ namespace ext { namespace oneapi { namespace experimental::matrix { -enum class tpu { - xmx8, - xmx16, - amx, -}; enum class matrix_type { - bf8, bf16, fp16, tf32, fp32, fp64, - sint2, - sint4, sint8, sint16, sint32, sint64, - uint2, - uint4, uint8, uint16, uint32, uint64 }; -enum class scope_t { sub_group, work_group }; +struct combination { + size_t max_msize; + size_t max_nsize; + size_t max_ksize; + size_t msize; + size_t nsize; + size_t ksize; + matrix_type atype; + matrix_type btype; + matrix_type ctype; + matrix_type dtype; +}; -template -struct tpu_params; +template +struct matrix_params; template -constexpr bool is_combination_valid_amx(int sM, int sN, int sK) { +constexpr bool is_combination_valid_amx(size_t sM, size_t sN, size_t sK) { // is_same_v is a C++17 feature if ((std::is_same_v && std::is_same_v && std::is_same_v && sM <= 16 && sN <= 16 && sK <= 64) || @@ -104,46 +106,14 @@ constexpr bool are_types_valid_amx() { return false; } -// General query: -// types are not given, no default sizes and no implicit matrix construction -template -struct tpu_params { - static constexpr std::size_t M = -1; // depends on the type - static constexpr std::size_t N = -1; - static constexpr std::size_t K = -1; - - uint32_t numtiles = 8; - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); - struct combination { - uint32_t max_msize; - uint32_t max_nsize; - uint32_t max_ksize; - matrix_type atype; - matrix_type btype; - matrix_type accumulatortype; - 
uint32_t msize; - uint32_t nsize; - uint32_t ksize; - }; - using mt = matrix_type; - static constexpr combination combinations[] = { - {16, 16, 64, mt::sint8, mt::sint8, mt::sint32}, - {16, 16, 64, mt::sint8, mt::uint8, mt::sint32}, - {16, 16, 64, mt::uint8, mt::sint8, mt::sint32}, - {16, 16, 64, mt::uint8, mt::uint8, mt::sint32}, - {16, 16, 32, mt::bf16, mt::bf16, mt::fp32}}; - static constexpr int num_combinations = - sizeof(combinations) / sizeof(combination); -}; - -// Sizes-only query +// Default values query // Specialization for when only types are given, need to query only sizes -template -struct tpu_params && - !std::is_same_v && - !std::is_same_v)>::type> { +template +struct matrix_params< + architecture::intel_cpu_spr, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if<(!std::is_same_v && + !std::is_same_v && + !std::is_same_v)>::type> { static_assert((are_types_valid_amx()), "Invalid types for AMX, supported types are int8_t, uint8_t, " "and bf16 (Note that unsigned short should be used in the" @@ -159,34 +129,17 @@ struct tpu_params using joint_matrix_b = joint_matrix; template - using joint_matrix_accumulator = - joint_matrix; - - uint32_t numtiles = 8; - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); - struct combination { - uint32_t max_msize; - uint32_t max_nsize; - uint32_t max_ksize; - matrix_type atype; - matrix_type btype; - matrix_type accumulatortype; - uint32_t msize; - uint32_t nsize; - uint32_t ksize; - }; - static constexpr combination combinations[] = { - {16, 16, (sizeof(Ta) == 1) ? 64 : 32}}; - static constexpr int num_combinations = - sizeof(combinations) / sizeof(combination); + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; }; -// Valid or not: +// Validation query // Specialization when both types and sizes are given -template -struct tpu_params< - tpu::amx, Ta, Tb, Tc, sM, sN, sK, +template +struct matrix_params< + architecture::intel_cpu_spr, Ta, Tb, Tc, Td, sM, sN, sK, typename std::enable_if<( !std::is_same_v && !std::is_same_v && !std::is_same_v && sM != 0 && sN != 0 && sK != 0)>::type> { @@ -195,27 +148,24 @@ struct tpu_params< (sM == 0 && sN == 0 && sK == 0) || (is_combination_valid_amx(sM, sN, sK)), "Invalid parameters for AMX, query valid types and maximum sizes " - "using: tpu_params myparams; and then check out " + "using: matrix_params myparams; and then " + "check out " "myparams.combinations array"); // if combination is valid, construct the matrices - static constexpr std::size_t M = (sM != 0) ? sM : 16; - static constexpr std::size_t N = (sN != 0) ? sN : 16; - static constexpr std::size_t K = - (sK != 0) ? sK : ((sizeof(Ta) == 1) ? 64 : 32); + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; template using joint_matrix_a = joint_matrix; template using joint_matrix_b = joint_matrix; template - using joint_matrix_accumulator = - joint_matrix; - - uint32_t numtiles = 8; - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; }; // Intel XMX with SIMD8 capability @@ -224,25 +174,25 @@ struct tpu_params< // capabilities of the Intel XMX hardware. 
template -constexpr bool is_combination_valid_xmx8(int sM, int sN, int sK) { +constexpr bool is_combination_valid_xmx8(size_t sM, size_t sN, size_t sK) { if ((std::is_same_v && std::is_same_v && - std::is_same_v && (sM == 1 || sM == 2 || sM == 4 || sM == 8) && - sN == 8 && sK == 32) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 8 && + sK == 32) || (std::is_same_v && std::is_same_v && - std::is_same_v && (sM == 1 || sM == 2 || sM == 4 || sM == 8) && - sN == 8 && sK == 32) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 8 && + sK == 32) || (std::is_same_v && std::is_same_v && - std::is_same_v && (sM == 1 || sM == 2 || sM == 4 || sM == 8) && - sN == 8 && sK == 32) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 8 && + sK == 32) || (std::is_same_v && std::is_same_v && - std::is_same_v && (sM == 1 || sM == 2 || sM == 4 || sM == 8) && - sN == 8 && sK == 32) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 8 && + sK == 32) || (std::is_same_v && std::is_same_v && - std::is_same_v && - (sM == 1 || sM == 2 || sM == 4 || sM == 8) && sN == 8 && sK == 16) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 8 && + sK == 16) || (std::is_same_v && std::is_same_v && std::is_same_v && - (sM == 1 || sM == 2 || sM == 4 || sM == 8) && sN == 8 && sK == 16)) + (sM >= 1 && sM <= 8) && sN == 8 && sK == 16)) return true; else return false; @@ -267,72 +217,140 @@ constexpr bool are_types_valid_xmx8() { return false; } -// General Query -// specialization for when types are not given --> no default values -template -struct tpu_params { - static constexpr std::size_t M = -1; // depends on the type - static constexpr std::size_t N = -1; - static constexpr std::size_t K = -1; - - uint32_t numtiles = -1; // does not apply for XMX8 - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); - - struct combination { - uint32_t max_msize; - uint32_t max_nsize; - uint32_t max_ksize; - matrix_type atype; - matrix_type btype; - matrix_type accumulatortype; - uint32_t msize; - uint32_t nsize; - uint32_t ksize; - }; - using mt = matrix_type; - static constexpr combination combinations[] = { - {0, 0, 0, mt::sint8, mt::sint8, mt::sint32, 1, 8, 32}, - {0, 0, 0, mt::sint8, mt::sint8, mt::sint32, 2, 8, 32}, - {0, 0, 0, mt::sint8, mt::sint8, mt::sint32, 4, 8, 32}, - {0, 0, 0, mt::sint8, mt::sint8, mt::sint32, 8, 8, 32}, - {0, 0, 0, mt::sint8, mt::uint8, mt::sint32, 1, 8, 32}, - {0, 0, 0, mt::sint8, mt::uint8, mt::sint32, 2, 8, 32}, - {0, 0, 0, mt::sint8, mt::uint8, mt::sint32, 4, 8, 32}, - {0, 0, 0, mt::sint8, mt::uint8, mt::sint32, 8, 8, 32}, - {0, 0, 0, mt::uint8, mt::sint8, mt::sint32, 1, 8, 32}, - {0, 0, 0, mt::uint8, mt::sint8, mt::sint32, 2, 8, 32}, - {0, 0, 0, mt::uint8, mt::sint8, mt::sint32, 4, 8, 32}, - {0, 0, 0, mt::uint8, mt::sint8, mt::sint32, 8, 8, 32}, - {0, 0, 0, mt::uint8, mt::uint8, mt::sint32, 1, 8, 32}, - {0, 0, 0, mt::uint8, mt::uint8, mt::sint32, 2, 8, 32}, - {0, 0, 0, mt::uint8, mt::uint8, mt::sint32, 4, 8, 32}, - {0, 0, 0, mt::uint8, mt::uint8, mt::sint32, 8, 8, 32}, - {0, 0, 0, mt::fp16, mt::fp16, mt::fp32, 1, 8, 16}, - {0, 0, 0, mt::fp16, mt::fp16, mt::fp32, 2, 8, 16}, - {0, 0, 0, mt::fp16, mt::fp16, mt::fp32, 4, 8, 16}, - {0, 0, 0, mt::fp16, mt::fp16, mt::fp32, 8, 8, 16}, - {0, 0, 0, mt::bf16, mt::bf16, mt::fp32, 1, 8, 16}, - {0, 0, 0, mt::bf16, mt::bf16, mt::fp32, 2, 8, 16}, - {0, 0, 0, mt::bf16, mt::bf16, mt::fp32, 4, 8, 16}, - {0, 0, 0, mt::bf16, mt::bf16, mt::fp32, 8, 8, 16}, - }; - static constexpr int 
num_combinations = - sizeof(combinations) / sizeof(combination); +// Default-values query: +// Specialization for when only types are given, need to query only sizes + +template +struct matrix_params< + architecture::intel_gpu_dg1, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if<(!std::is_same_v && + !std::is_same_v && + !std::is_same_v)>::type> { + static_assert((are_types_valid_xmx8()), + "Invalid types for architecture::intel_gpu_dg1, supported " + "types are int8_t, uint8_t, half, and bf16"); + + // construct the matrices using the default sizes + + static constexpr std::size_t M = 8; + static constexpr std::size_t N = 8; + static constexpr std::size_t K = ((sizeof(Ta) == 1) ? 32 : 16); + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Validation query: +// Specialization when both types and sizes are given +template +struct matrix_params< + architecture::intel_gpu_dg1, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && sM != 0 && sN != 0 && sK != 0)>::type> { + // Validate that parameters are supported + static_assert( + (sM == 0 && sN == 0 && sK == 0) || + (is_combination_valid_xmx8(sM, sN, sK)), + "Invalid parameters for XMX8, query valid combinations " + "using: " + "q.get_device().get_info()"); + + // if combination is valid, construct the matrices + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; }; -// Sizes-only query: +// Default-values query: // Specialization for when only types are given, need to query only sizes -template -struct tpu_params && - !std::is_same_v && - !std::is_same_v)>::type> { +template +struct matrix_params< + architecture::intel_gpu_dg2_g10, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if<(!std::is_same_v && + !std::is_same_v && + !std::is_same_v)>::type> { + static_assert((are_types_valid_xmx8()), + "Invalid types for architecture::intel_gpu_dg1, supported " + "types are int8_t, uint8_t, half, and bf16"); + + // construct the matrices using the default sizes + + static constexpr std::size_t M = 8; + static constexpr std::size_t N = 8; + static constexpr std::size_t K = ((sizeof(Ta) == 1) ? 
32 : 16); + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Validation query: +// Specialization when both types and sizes are given +template +struct matrix_params< + architecture::intel_gpu_dg2_g10, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && sM != 0 && sN != 0 && sK != 0)>::type> { + // Validate that parameters are supported + static_assert( + (sM == 0 && sN == 0 && sK == 0) || + (is_combination_valid_xmx8(sM, sN, sK)), + "Invalid parameters for XMX8, query valid combinations " + "using: " + "q.get_device().get_info()"); + + // if combination is valid, construct the matrices + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Default-values query: +// Specialization for when only types are given, need to query only sizes + +template +struct matrix_params< + architecture::intel_gpu_dg2_g11, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if<(!std::is_same_v && + !std::is_same_v && + !std::is_same_v)>::type> { static_assert((are_types_valid_xmx8()), - "Invalid types for XMX8, supported types are int8_t, uint8_t, " - "half, and bf16 (Note that unsigned short should be used in the" - "DPC++ code to implement bf16)"); + "Invalid types for architecture::intel_gpu_dg1, supported " + "types are int8_t, uint8_t, half, and bf16"); // construct the matrices using the default sizes @@ -345,68 +363,102 @@ struct tpu_params using joint_matrix_b = joint_matrix; template - using joint_matrix_accumulator = - joint_matrix; - - uint32_t numtiles = -1; // does not apply for XMX8 - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); - struct combination { - uint32_t max_msize; - uint32_t max_nsize; - uint32_t max_ksize; - matrix_type atype; - matrix_type btype; - matrix_type accumulatortype; - uint32_t msize; - uint32_t nsize; - uint32_t ksize; - }; - using mt = matrix_type; - static constexpr combination combinations[] = { - // The types used in the initialization below are fake and not used. In - // this case, users already chose the types, they are only looking for - // the - // sizes - {0, 0, 0, mt::bf8, mt::bf8, mt::bf8, 1, 8, (sizeof(Ta) == 1) ? 32 : 16}, - {0, 0, 0, mt::bf8, mt::bf8, mt::bf8, 2, 8, (sizeof(Ta) == 1) ? 32 : 16}, - {0, 0, 0, mt::bf8, mt::bf8, mt::bf8, 4, 8, (sizeof(Ta) == 1) ? 32 : 16}, - {0, 0, 0, mt::bf8, mt::bf8, mt::bf8, 8, 8, (sizeof(Ta) == 1) ? 
32 : 16}, - }; - static constexpr int num_combinations = - sizeof(combinations) / sizeof(combination); + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; }; -// Valid or not: +// Validation query: // Specialization when both types and sizes are given -template -struct tpu_params< - tpu::xmx8, Ta, Tb, Tc, sM, sN, sK, - typename std::enable_if<((!std::is_same_v && sM != 0))>::type> { +template +struct matrix_params< + architecture::intel_gpu_dg2_g11, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && sM != 0 && sN != 0 && sK != 0)>::type> { // Validate that parameters are supported - static_assert((sM == 0 && sN == 0 && sK == 0) || - (is_combination_valid_xmx8(sM, sN, sK)), - "Invalid parameters for XMX8, query valid combinations " - "using: tpu_params myparams; and then check out " - "myparams.combinations array"); + static_assert( + (sM == 0 && sN == 0 && sK == 0) || + (is_combination_valid_xmx8(sM, sN, sK)), + "Invalid parameters for XMX8, query valid combinations " + "using: " + "q.get_device().get_info()"); // if combination is valid, construct the matrices - static constexpr std::size_t M = (sM != 0) ? sM : 8; - static constexpr std::size_t N = (sN != 0) ? sN : 8; - static constexpr std::size_t K = - (sK != 0) ? sK : ((sizeof(Ta) == 1) ? 32 : 16); + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; + +// Default-values query: +// Specialization for when only types are given, need to query only sizes + +template +struct matrix_params< + architecture::intel_gpu_dg2_g12, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if<(!std::is_same_v && + !std::is_same_v && + !std::is_same_v)>::type> { + static_assert((are_types_valid_xmx8()), + "Invalid types for architecture::intel_gpu_dg1, supported " + "types are int8_t, uint8_t, half, and bf16"); + + // construct the matrices using the default sizes + + static constexpr std::size_t M = 8; + static constexpr std::size_t N = 8; + static constexpr std::size_t K = ((sizeof(Ta) == 1) ? 
32 : 16); template using joint_matrix_a = joint_matrix; template using joint_matrix_b = joint_matrix; template - using joint_matrix_accumulator = - joint_matrix; + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; +}; - uint32_t numtiles = -1; // does not apply for XMX8 - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); +// Validation query: +// Specialization when both types and sizes are given +template +struct matrix_params< + architecture::intel_gpu_dg2_g12, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && sM != 0 && sN != 0 && sK != 0)>::type> { + // Validate that parameters are supported + static_assert( + (sM == 0 && sN == 0 && sK == 0) || + (is_combination_valid_xmx8(sM, sN, sK)), + "Invalid parameters for XMX8, query valid combinations " + "using: " + "q.get_device().get_info()"); + + // if combination is valid, construct the matrices + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; + + template + using joint_matrix_a = joint_matrix; + template + using joint_matrix_b = joint_matrix; + template + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; }; // Intel XMX with SIMD16 capability @@ -415,25 +467,25 @@ struct tpu_params< // capabilities of the Intel XMX hardware. template -constexpr bool is_combination_valid_xmx16(int sM, int sN, int sK) { +constexpr bool is_combination_valid_xmx16(size_t sM, size_t sN, size_t sK) { if ((std::is_same_v && std::is_same_v && - std::is_same_v && (sM == 1 || sM == 2 || sM == 4 || sM == 8) && - sN == 16 && sK == 32) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 16 && + sK == 32) || (std::is_same_v && std::is_same_v && - std::is_same_v && (sM == 1 || sM == 2 || sM == 4 || sM == 8) && - sN == 16 && sK == 32) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 16 && + sK == 32) || (std::is_same_v && std::is_same_v && - std::is_same_v && (sM == 1 || sM == 2 || sM == 4 || sM == 8) && - sN == 16 && sK == 32) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 16 && + sK == 32) || (std::is_same_v && std::is_same_v && - std::is_same_v && (sM == 1 || sM == 2 || sM == 4 || sM == 8) && - sN == 16 && sK == 32) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 16 && + sK == 32) || (std::is_same_v && std::is_same_v && - std::is_same_v && - (sM == 1 || sM == 2 || sM == 4 || sM == 8) && sN == 16 && sK == 16) || + std::is_same_v && (sM >= 1 && sM <= 8) && sN == 16 && + sK == 16) || (std::is_same_v && std::is_same_v && std::is_same_v && - (sM == 1 || sM == 2 || sM == 4 || sM == 8) && sN == 16 && sK == 16)) + (sM >= 1 && sM <= 8) && sN == 16 && sK == 16)) return true; else return false; @@ -458,72 +510,19 @@ constexpr bool are_types_valid_xmx16() { return false; } -// General Query -// specialization for when types are not given --> no default values -template -struct tpu_params { - static constexpr std::size_t M = -1; // depends on the type - static constexpr std::size_t N = -1; - static constexpr std::size_t K = -1; - - uint32_t numtiles = -1; // does not apply for XMX - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); - - struct combination { - uint32_t max_msize; - uint32_t max_nsize; - uint32_t max_ksize; - matrix_type atype; - matrix_type btype; - matrix_type accumulatortype; - uint32_t 
msize; - uint32_t nsize; - uint32_t ksize; - }; - using mt = matrix_type; - static constexpr combination combinations[] = { - {0, 0, 0, mt::sint8, mt::sint8, mt::sint32, 1, 16, 32}, - {0, 0, 0, mt::sint8, mt::sint8, mt::sint32, 2, 16, 32}, - {0, 0, 0, mt::sint8, mt::sint8, mt::sint32, 4, 16, 32}, - {0, 0, 0, mt::sint8, mt::sint8, mt::sint32, 8, 16, 32}, - {0, 0, 0, mt::sint8, mt::uint8, mt::sint32, 1, 16, 32}, - {0, 0, 0, mt::sint8, mt::uint8, mt::sint32, 2, 16, 32}, - {0, 0, 0, mt::sint8, mt::uint8, mt::sint32, 4, 16, 32}, - {0, 0, 0, mt::sint8, mt::uint8, mt::sint32, 8, 16, 32}, - {0, 0, 0, mt::uint8, mt::sint8, mt::sint32, 1, 16, 32}, - {0, 0, 0, mt::uint8, mt::sint8, mt::sint32, 2, 16, 32}, - {0, 0, 0, mt::uint8, mt::sint8, mt::sint32, 4, 16, 32}, - {0, 0, 0, mt::uint8, mt::sint8, mt::sint32, 8, 16, 32}, - {0, 0, 0, mt::uint8, mt::uint8, mt::sint32, 1, 16, 32}, - {0, 0, 0, mt::uint8, mt::uint8, mt::sint32, 2, 16, 32}, - {0, 0, 0, mt::uint8, mt::uint8, mt::sint32, 4, 16, 32}, - {0, 0, 0, mt::uint8, mt::uint8, mt::sint32, 8, 16, 32}, - {0, 0, 0, mt::fp16, mt::fp16, mt::fp32, 1, 16, 16}, - {0, 0, 0, mt::fp16, mt::fp16, mt::fp32, 2, 16, 16}, - {0, 0, 0, mt::fp16, mt::fp16, mt::fp32, 4, 16, 16}, - {0, 0, 0, mt::fp16, mt::fp16, mt::fp32, 8, 16, 16}, - {0, 0, 0, mt::bf16, mt::bf16, mt::fp32, 1, 16, 16}, - {0, 0, 0, mt::bf16, mt::bf16, mt::fp32, 2, 16, 16}, - {0, 0, 0, mt::bf16, mt::bf16, mt::fp32, 4, 16, 16}, - {0, 0, 0, mt::bf16, mt::bf16, mt::fp32, 8, 16, 16}, - }; - static constexpr int num_combinations = - sizeof(combinations) / sizeof(combination); -}; - -// Sizes-only query: +// Default values query: // Specialization for when only types are given, need to query only sizes -template -struct tpu_params && - !std::is_same_v && - !std::is_same_v)>::type> { +template +struct matrix_params< + architecture::intel_gpu_pvc, Ta, Tb, Tc, Td, 0, 0, 0, + typename std::enable_if<(!std::is_same_v && + !std::is_same_v && + !std::is_same_v)>::type> { static_assert((are_types_valid_xmx16()), - "Invalid types for XMX16, supported types are int8_t, uint8_t, " - "half, and bf16 (Note that unsigned short should be used in the" - "DPC++ code to implement bf16)"); + "Invalid types for architecture::intel_gpu_pvc, supported " + "types are int8_t, uint8_t, " + "half, and bf16"); // construct the matrices using the default sizes @@ -536,68 +535,42 @@ struct tpu_params using joint_matrix_b = joint_matrix; template - using joint_matrix_accumulator = - joint_matrix; - - uint32_t numtiles = -1; // does not apply for XMX - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); - struct combination { - uint32_t max_msize; - uint32_t max_nsize; - uint32_t max_ksize; - matrix_type atype; - matrix_type btype; - matrix_type accumulatortype; - uint32_t msize; - uint32_t nsize; - uint32_t ksize; - }; - using mt = matrix_type; - static constexpr combination combinations[] = { - // The types used in the initialization below are fake and not used. In - // this case, users already chose the types, they are only looking for - // the - // sizes - {0, 0, 0, mt::bf8, mt::bf8, mt::bf8, 1, 16, (sizeof(Ta) == 1) ? 32 : 16}, - {0, 0, 0, mt::bf8, mt::bf8, mt::bf8, 2, 16, (sizeof(Ta) == 1) ? 32 : 16}, - {0, 0, 0, mt::bf8, mt::bf8, mt::bf8, 4, 16, (sizeof(Ta) == 1) ? 32 : 16}, - {0, 0, 0, mt::bf8, mt::bf8, mt::bf8, 8, 16, (sizeof(Ta) == 1) ? 
32 : 16}, - }; - static constexpr int num_combinations = - sizeof(combinations) / sizeof(combination); + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; }; -// Valid or not: +// Validation query: // Specialization when both types and sizes are given -template -struct tpu_params< - tpu::xmx16, Ta, Tb, Tc, sM, sN, sK, - typename std::enable_if<((!std::is_same_v && sM != 0))>::type> { +template +struct matrix_params< + architecture::intel_gpu_pvc, Ta, Tb, Tc, Td, sM, sN, sK, + typename std::enable_if<( + !std::is_same_v && !std::is_same_v && + !std::is_same_v && sM != 0 && sN != 0 && sK != 0)>::type> { // Validate that parameters are supported - static_assert((sM == 0 && sN == 0 && sK == 0) || - (is_combination_valid_xmx16(sM, sN, sK)), - "Invalid parameters for XMX16, query valid combinations " - "using: tpu_params myparams; and then check out " - "myparams.combinations array"); + static_assert( + (sM == 0 && sN == 0 && sK == 0) || + (is_combination_valid_xmx16(sM, sN, sK)), + "Invalid parameters for architecture::intel_gpu_pvc, query valid " + "combinations " + "using: " + "q.get_device().get_info()"); // if combination is valid, construct the matrices - static constexpr std::size_t M = (sM != 0) ? sM : 8; - static constexpr std::size_t N = (sN != 0) ? sN : 8; - static constexpr std::size_t K = - (sK != 0) ? sK : ((sizeof(Ta) == 1) ? 32 : 16); + static constexpr std::size_t M = sM; + static constexpr std::size_t N = sN; + static constexpr std::size_t K = sK; template using joint_matrix_a = joint_matrix; template using joint_matrix_b = joint_matrix; template - using joint_matrix_accumulator = - joint_matrix; - - uint32_t numtiles = -1; // does not apply for XMX16 - static constexpr scope_t scopes[] = {scope_t::sub_group}; - static constexpr int num_scopes = sizeof(scopes) / sizeof(scope_t); + using joint_matrix_c = joint_matrix; + template + using joint_matrix_d = joint_matrix; }; } // namespace experimental::matrix } // namespace oneapi diff --git a/sycl/test-e2e/Matrix/joint_matrix_query_default.cpp b/sycl/test-e2e/Matrix/joint_matrix_query_default.cpp index 048aed6341f6..ccf50a5a76da 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_query_default.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_query_default.cpp @@ -39,7 +39,8 @@ void matrix_multiply(big_matrix &C, size_t K = NUM_COLS_A; assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); - using myparams2 = tpu_params; + using myparams2 = + matrix_params; constexpr int TM = myparams2::M; constexpr int TN = myparams2::N; constexpr int TK = myparams2::K; @@ -80,7 +81,7 @@ void matrix_multiply(big_matrix &C, myparams2::joint_matrix_b< sub_group, ext::intel::experimental::matrix::layout::packed> sub_b; - myparams2::joint_matrix_accumulator sub_c; + myparams2::joint_matrix_c sub_c; joint_matrix_load( sg, sub_c, diff --git a/sycl/test/matrix/query-use.cpp b/sycl/test/matrix/query-use.cpp index 9afc8e117304..e39361bbeead 100644 --- a/sycl/test/matrix/query-use.cpp +++ b/sycl/test/matrix/query-use.cpp @@ -1,162 +1,146 @@ -// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -fsycl -o query-use %s +// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -fsycl -o compile-time-query %s #include #include using namespace sycl; +using namespace sycl::ext::oneapi::experimental; using namespace sycl::ext::oneapi::experimental::matrix; -void query_amx() { +void query_amx_spr() { // generates combination assert - // using myparams = tpu_params; + // using myparams = matrix_params; // generates types assert - // 
using myparams2 = tpu_params; + // using myparams2 = matrix_params; // tells whether a combination is valid or not, if valid, those will be set as // default - using myparams = tpu_params; + using myparams = matrix_params; size_t dmsize = myparams::M; size_t dnsize = myparams::N; size_t dksize = myparams::K; - std::cout << "sizes of AMX tpu_params chosen by the user are: M " << dmsize + std::cout << "sizes of AMX matrix_params chosen by the user are: M " << dmsize << " N " << dnsize << " K " << dksize << std::endl; // Sizes-only query: types are given, generate default sizes - using myparams2 = tpu_params; + using myparams2 = + matrix_params; myparams2 p; dmsize = myparams2::M; dnsize = myparams2::N; dksize = myparams2::K; - std::cout << "default AMX sizes tpu_params are: M " << dmsize << " N " - << dnsize << " K " << dksize << "\n AMX int8 num combinations is " - << p.num_combinations << std::endl; - - // general query: types are not given - tpu_params myparams3; - - if (myparams3.num_scopes > 0) - if (myparams3.scopes[0] == scope_t::sub_group) - std::cout << "There are " << myparams3.num_scopes - << " Scopes that are supported by AMX implementation and " - "subgroup is one of them " - << std::endl; - - std::cout << "AMX query num combinations: " << myparams3.num_combinations - << std::endl; - - if (myparams3.combinations[0].msize != 0) // this is a max params hardware - return; - constexpr int msize = myparams3.combinations[0].max_msize; - constexpr int nsize = myparams3.combinations[0].max_nsize; - constexpr int ksize = myparams3.combinations[0].max_ksize; - std::cout << "AMX query sizes are: M " << msize << " N " << nsize << " K " - << ksize << std::endl; - - size_t NDRangeM = 1024 / msize; - size_t NDRangeN = 1024 / nsize; - queue q; - q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN}, {1, 1}), - [msize, ksize, nsize](nd_item<2> spmd_item) { - sub_group sg = spmd_item.get_sub_group(); - myparams2::joint_matrix_a sub_a1; - myparams2::joint_matrix_b< - sub_group, sycl::ext::intel::experimental::matrix::layout::packed> - sub_b1; - myparams2::joint_matrix_accumulator sub_c1; - - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix sub_c; - }); - }); + std::cout << "default AMX sizes matrix_params are: M " << dmsize << " N " + << dnsize << " K " << dksize << std::endl; + return; } -void query_xmx8() { +void query_xmx_dg() { // generates combination assert - // using myparams = tpu_params; + // using myparams = matrix_params; // generate combination of type assert - // using myparams = tpu_params; + // using myparams = matrix_params; // tells whether a combination is valid or not, if valid, those will be set as // default - using myparams = tpu_params; + using myparams = matrix_params; size_t dmsize = myparams::M; size_t dnsize = myparams::N; size_t dksize = myparams::K; - std::cout << "sizes of XMX8 tpu_params chosen by the user are: M " << dmsize - << " N " << dnsize << " K " << dksize << std::endl; + std::cout << "sizes of Intel XMX of architecture::intel_gpu_dg1 " + "matrix_params chosen by the user are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; // sizes-only query: types are given, generate default sizes - using myparams2 = tpu_params; - myparams2 p; + using myparams2 = + matrix_params; dmsize = myparams2::M; dnsize = myparams2::N; dksize = myparams2::K; - std::cout << "Default XMX8 sizes are: M " << dmsize << " N " << dnsize - << " K " << dksize << "\n XMX8 int8 num combinations is " - << p.num_combinations << std::endl; 
- - dmsize = myparams2::combinations[0].msize; - dnsize = myparams2::combinations[0].nsize; - dksize = myparams2::combinations[0].ksize; - std::cout << "one of XMX8 combination sizes is: M " << dmsize << " N " - << dnsize << " K " << dksize << std::endl; + std::cout << "Default Intel XMX of architecture::intel_gpu_dg1 sizes are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; + return; +} + +void query_xmx_ats() { + + // generates combination assert + // using myparams = matrix_params; + + // generate combination of type assert + // using myparams = matrix_params; - // general query: types are not given - tpu_params myparams3; - - if (myparams3.num_scopes > 0) - if (myparams3.scopes[0] == scope_t::sub_group) - std::cout << "There are " << myparams3.num_scopes - << " Scopes that are supported by XMX8 implementation and " - "subgroup is one of them " - << std::endl; - - std::cout << "XMX8 query num combinations: " << myparams3.num_combinations - << std::endl; - - if (myparams3.combinations[0].msize == 0) // this is not a max params hardware - return; - constexpr int msize = myparams3.combinations[0].msize; - constexpr int nsize = myparams3.combinations[0].nsize; - constexpr int ksize = myparams3.combinations[0].ksize; - std::cout << "XMX8 query sizes are: M " << msize << " N " << nsize << " K " - << ksize << std::endl; - std::cout << "XMX8 query max sizes are: M " - << myparams3.combinations[0].max_msize << " N " - << myparams3.combinations[0].max_nsize << " K " - << myparams3.combinations[0].max_ksize << std::endl; - - size_t NDRangeM = 1024 / msize; - size_t NDRangeN = 1024 / nsize; - queue q; - q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN}, {1, 1}), - [msize, ksize, nsize](nd_item<2> spmd_item) { - sub_group sg = spmd_item.get_sub_group(); - myparams2::joint_matrix_a sub_a1; - myparams2::joint_matrix_b< - sub_group, sycl::ext::intel::experimental::matrix::layout::packed> - sub_b1; - myparams2::joint_matrix_accumulator sub_c1; - - joint_matrix sub_a; - joint_matrix sub_b; - joint_matrix sub_c; - }); - }); + // tells whether a combination is valid or not, if valid, those will be set as + // default + using myparams = matrix_params; + + size_t dmsize = myparams::M; + size_t dnsize = myparams::N; + size_t dksize = myparams::K; + std::cout << "sizes of Intel XMX of architecture::intel_gpu_dg2_g10 " + "matrix_params chosen by the user are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; + + // sizes-only query: types are given, generate default sizes + using myparams2 = + matrix_params; + dmsize = myparams2::M; + dnsize = myparams2::N; + dksize = myparams2::K; + std::cout + << "Default Intel XMX of architecture::intel_gpu_dg2_g10 sizes are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; + return; +} + +void query_xmx_pvc() { + + // generates combination assert + // using myparams = matrix_params; + + // generate combination of type assert + // using myparams = matrix_params; + + // tells whether a combination is valid or not, if valid, those will be set as + // default + using myparams = matrix_params; + + size_t dmsize = myparams::M; + size_t dnsize = myparams::N; + size_t dksize = myparams::K; + std::cout << "sizes of architecture::intel_gpu_pvc matrix_params chosen by " + "the user are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; + + // sizes-only query: types are given, generate default sizes + using myparams2 = + matrix_params; + dmsize = myparams2::M; + dnsize = 
myparams2::N; + dksize = myparams2::K; + std::cout << "Default Intel XMX of architecture::intel_gpu_pvc sizes are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; + return; } int main() { - query_amx(); - query_xmx8(); + query_amx_spr(); + query_xmx_dg(); + query_xmx_ats(); + query_xmx_pvc(); return 0; } diff --git a/sycl/test/matrix/runtime-query.cpp b/sycl/test/matrix/runtime-query.cpp new file mode 100644 index 000000000000..64090faad465 --- /dev/null +++ b/sycl/test/matrix/runtime-query.cpp @@ -0,0 +1,50 @@ +// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -fsycl -o runtime-query %s +// XFAIL: * + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental; +using namespace sycl::ext::oneapi::experimental::matrix; + +template +void matrix_runtime_query(queue q) { + + std::vector combinations = + q.get_device().get_info(); + + std::cout << "The matrix hardware implementation in this device provides " + "this number of combinations: " + << combinations.size() << std::endl; + + bool max_sizes; + if (combinations[0].maxsize == 0) + max_sizes = true; // this is a max params hardware + else + max_sizes = false; + for (int i = 0; i < combinations.size(); i++) { + if (Ta == combinations[i].atype && Tb == combinations[i].btype && + Tc == combinations[i].ctype && Td == combinations[i].dtype) { + // joint matrix GEMM kernel can be called using these sizes + if (max_sizes) + std::cout << "The matrix hardware implementation in this device " + "provides the following max sizes are: M " + << combinations[i].max_msize << " N " + << combinations[i].max_nsize << " K " + << combinations[i].max_ksize << std::endl; + else + std::cout << "The matrix hardware implementation in this device " + "provides the following sizes are: M " + << combinations[i].msize << " N " << combinations[i].nsize + << " K " << combinations[i].ksize << std::endl; + } + } +} + +int main() { + queue q; + matrix_runtime_query(q); + return 0; +} From 3c42caca1eb0350ea15cb63843a8003804336fe9 Mon Sep 17 00:00:00 2001 From: Dounia Date: Thu, 31 Aug 2023 10:16:58 -0700 Subject: [PATCH 02/13] Add dg2 names --- .../sycl_ext_oneapi_matrix.asciidoc | 40 ++++++----- .../ext/oneapi/matrix/static-query-use.hpp | 67 +------------------ 2 files changed, 27 insertions(+), 80 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index 94c2bebe0490..2f215228bbea 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -875,32 +875,40 @@ table below. ==== Intel XMX Supported Combinations This is currently available in devices with the architecture -`architecture::intel_gpu_pvc` and `architecture::intel_gpu_dg2`. In -these architectures' implementation, the type of the C matrix must be -the same as the type of the D matrix. Therefore, that common type is -shown in a single column in the table below. +`architecture::intel_gpu_pvc`, `architecture::intel_gpu_dg2_g10`, +`architecture::intel_gpu_dg2_g11`, and +`architecture::intel_gpu_dg2_g12`. In these architectures' +implementation, the type of the C matrix must be the same as the type +of the D matrix. Therefore, that common type is shown in a single +column in the table below. 
[frame="none",options="header"] |====================== | A type | B type | C and D type | M | N | K | device | `matrix_type::uint8` | `matrix_type::uint8` | -`matrix_type::int32` | +<=+ 8 | 16 | 32 | architecture::intel_gpu_pvc -| | | | |8||architecture::intel_gpu_dg2 +`matrix_type::int32` | +<=+ 8 | 16 | 32 | `architecture::intel_gpu_pvc` +| | | | |8||`architecture::intel_gpu_dg2_g10, +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` | `matrix_type::uint8` | `matrix_type::int8` | -`matrix_type::int32` | +<=+ 8 | 16 | 32 | architecture::intel_gpu_pvc -| | | | |8||architecture::intel_gpu_dg2 +`matrix_type::int32` | +<=+ 8 | 16 | 32 | `architecture::intel_gpu_pvc` +| | | | |8||`architecture::intel_gpu_dg2_g10, +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` | `matrix_type::int8` | `matrix_type::uint8` | -`matrix_type::int32` | +<=+ 8 | 16 | 32 | architecture::intel_gpu_pvc -| | | | |8||architecture::intel_gpu_dg2 +`matrix_type::int32` | +<=+ 8 | 16 | 32 | `architecture::intel_gpu_pvc` +| | | | |8||`architecture::intel_gpu_dg2_g10, +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` | `matrix_type::int8` | `matrix_type::int8` | -`matrix_type::int32` | +<=+ 8 | 16 | 32 | architecture::intel_gpu_pvc -| | | | |8||architecture::intel_gpu_dg2 +`matrix_type::int32` | +<=+ 8 | 16 | 32 | `architecture::intel_gpu_pvc` +| | | | |8||`architecture::intel_gpu_dg2_g10, +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` | `matrix_type::fp16` | `matrix_type::fp16` | -`matrix_type::fp32` | +<=+ 8 | 16 | 16 | architecture::intel_gpu_pvc -| | | | |8|| architecture::intel_gpu_dg2 +`matrix_type::fp32` | +<=+ 8 | 16 | 16 | `architecture::intel_gpu_pvc` +| | | | |8|| `architecture::intel_gpu_dg2_g10, +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` | `matrix_type::bf16` | `matrix_type::bf16` | -`matrix_type::fp32` | +<=+ 8 | 16 | 16 | architecture::intel_gpu_pvc -| | | | |8|| architecture::intel_gpu_dg2 +`matrix_type::fp32` | +<=+ 8 | 16 | 16 | `architecture::intel_gpu_pvc` +| | | | |8|| `architecture::intel_gpu_dg2_g10, +architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` |====================== ==== Nvidia Tensor Cores Supported Combinations diff --git a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp index a3db6eca3ec3..cfc699d86b38 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/static-query-use.hpp @@ -220,67 +220,6 @@ constexpr bool are_types_valid_xmx8() { // Default-values query: // Specialization for when only types are given, need to query only sizes -template -struct matrix_params< - architecture::intel_gpu_dg1, Ta, Tb, Tc, Td, 0, 0, 0, - typename std::enable_if<(!std::is_same_v && - !std::is_same_v && - !std::is_same_v)>::type> { - static_assert((are_types_valid_xmx8()), - "Invalid types for architecture::intel_gpu_dg1, supported " - "types are int8_t, uint8_t, half, and bf16"); - - // construct the matrices using the default sizes - - static constexpr std::size_t M = 8; - static constexpr std::size_t N = 8; - static constexpr std::size_t K = ((sizeof(Ta) == 1) ? 
32 : 16); - - template - using joint_matrix_a = joint_matrix; - template - using joint_matrix_b = joint_matrix; - template - using joint_matrix_c = joint_matrix; - template - using joint_matrix_d = joint_matrix; -}; - -// Validation query: -// Specialization when both types and sizes are given -template -struct matrix_params< - architecture::intel_gpu_dg1, Ta, Tb, Tc, Td, sM, sN, sK, - typename std::enable_if<( - !std::is_same_v && !std::is_same_v && - !std::is_same_v && sM != 0 && sN != 0 && sK != 0)>::type> { - // Validate that parameters are supported - static_assert( - (sM == 0 && sN == 0 && sK == 0) || - (is_combination_valid_xmx8(sM, sN, sK)), - "Invalid parameters for XMX8, query valid combinations " - "using: " - "q.get_device().get_info()"); - - // if combination is valid, construct the matrices - static constexpr std::size_t M = sM; - static constexpr std::size_t N = sN; - static constexpr std::size_t K = sK; - - template - using joint_matrix_a = joint_matrix; - template - using joint_matrix_b = joint_matrix; - template - using joint_matrix_c = joint_matrix; - template - using joint_matrix_d = joint_matrix; -}; - -// Default-values query: -// Specialization for when only types are given, need to query only sizes - template struct matrix_params< architecture::intel_gpu_dg2_g10, Ta, Tb, Tc, Td, 0, 0, 0, @@ -288,7 +227,7 @@ struct matrix_params< !std::is_same_v && !std::is_same_v)>::type> { static_assert((are_types_valid_xmx8()), - "Invalid types for architecture::intel_gpu_dg1, supported " + "Invalid types for architecture::intel_gpu_dg2_g10, supported " "types are int8_t, uint8_t, half, and bf16"); // construct the matrices using the default sizes @@ -349,7 +288,7 @@ struct matrix_params< !std::is_same_v && !std::is_same_v)>::type> { static_assert((are_types_valid_xmx8()), - "Invalid types for architecture::intel_gpu_dg1, supported " + "Invalid types for architecture::intel_gpu_dg2_g11, supported" "types are int8_t, uint8_t, half, and bf16"); // construct the matrices using the default sizes @@ -410,7 +349,7 @@ struct matrix_params< !std::is_same_v && !std::is_same_v)>::type> { static_assert((are_types_valid_xmx8()), - "Invalid types for architecture::intel_gpu_dg1, supported " + "Invalid types for architecture::intel_gpu_dg2_g12, supported " "types are int8_t, uint8_t, half, and bf16"); // construct the matrices using the default sizes From 3ca9bfdd0ea8fe90f30d725e460ab923d39feecf Mon Sep 17 00:00:00 2001 From: Dounia Date: Thu, 7 Sep 2023 11:31:28 -0700 Subject: [PATCH 03/13] Address Greg's comment: merge blank cells --- .../sycl_ext_oneapi_matrix.asciidoc | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index 2f215228bbea..734f7858d0ac 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -885,29 +885,29 @@ column in the table below. 
[frame="none",options="header"] |====================== | A type | B type | C and D type | M | N | K | device -| `matrix_type::uint8` | `matrix_type::uint8` | -`matrix_type::int32` | +<=+ 8 | 16 | 32 | `architecture::intel_gpu_pvc` -| | | | |8||`architecture::intel_gpu_dg2_g10, +.2+| `matrix_type::uint8` .2+| `matrix_type::uint8` .2+| +`matrix_type::int32` .2+| +<=+ 8 | 16 .2+| 32 +|`architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -| `matrix_type::uint8` | `matrix_type::int8` | -`matrix_type::int32` | +<=+ 8 | 16 | 32 | `architecture::intel_gpu_pvc` -| | | | |8||`architecture::intel_gpu_dg2_g10, +.2+| `matrix_type::uint8` .2+| `matrix_type::int8` .2+| +`matrix_type::int32` .2+| +<=+ 8 | 16 .2+| 32 | +`architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -| `matrix_type::int8` | `matrix_type::uint8` | -`matrix_type::int32` | +<=+ 8 | 16 | 32 | `architecture::intel_gpu_pvc` -| | | | |8||`architecture::intel_gpu_dg2_g10, +.2+| `matrix_type::int8` .2+| `matrix_type::uint8` .2+| +`matrix_type::int32` .2+| +<=+ 8 | 16 .2+| 32 | +`architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -| `matrix_type::int8` | `matrix_type::int8` | -`matrix_type::int32` | +<=+ 8 | 16 | 32 | `architecture::intel_gpu_pvc` -| | | | |8||`architecture::intel_gpu_dg2_g10, +.2+| `matrix_type::int8` .2+| `matrix_type::int8` .2+| +`matrix_type::int32` .2+| +<=+ 8 | 16 .2+| 32 | +`architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -| `matrix_type::fp16` | `matrix_type::fp16` | -`matrix_type::fp32` | +<=+ 8 | 16 | 16 | `architecture::intel_gpu_pvc` -| | | | |8|| `architecture::intel_gpu_dg2_g10, +.2+|`matrix_type::fp16` .2+| `matrix_type::fp16` .2+| +`matrix_type::fp32` .2+| +<=+ 8 | 16 .2+| 16 | +`architecture::intel_gpu_pvc`|8| `architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -| `matrix_type::bf16` | `matrix_type::bf16` | -`matrix_type::fp32` | +<=+ 8 | 16 | 16 | `architecture::intel_gpu_pvc` -| | | | |8|| `architecture::intel_gpu_dg2_g10, +.2+| `matrix_type::bf16` .2+| `matrix_type::bf16` .2+| +`matrix_type::fp32` .2+| +<=+ 8 | 16 .2+| 16 | +`architecture::intel_gpu_pvc` |8| `architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` |====================== From 833b8da5523f0ad1f64c6e832df6751add589ff2 Mon Sep 17 00:00:00 2001 From: Dounia Date: Thu, 14 Sep 2023 09:02:41 -0700 Subject: [PATCH 04/13] fix typo in XMX and NVIDIA tables: matrix_type::int32 should be matrix_type::sint32 --- .../sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index 734f7858d0ac..3d277c71ec7a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -886,19 +886,19 @@ column in the table below. 
|====================== | A type | B type | C and D type | M | N | K | device .2+| `matrix_type::uint8` .2+| `matrix_type::uint8` .2+| -`matrix_type::int32` .2+| +<=+ 8 | 16 .2+| 32 +`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 |`architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` .2+| `matrix_type::uint8` .2+| `matrix_type::int8` .2+| -`matrix_type::int32` .2+| +<=+ 8 | 16 .2+| 32 | +`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | `architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` .2+| `matrix_type::int8` .2+| `matrix_type::uint8` .2+| -`matrix_type::int32` .2+| +<=+ 8 | 16 .2+| 32 | +`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | `architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` .2+| `matrix_type::int8` .2+| `matrix_type::int8` .2+| -`matrix_type::int32` .2+| +<=+ 8 | 16 .2+| 32 | +`matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | `architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` .2+|`matrix_type::fp16` .2+| `matrix_type::fp16` .2+| @@ -940,11 +940,11 @@ supported parameter combination is specified in the following table. |16 |16 |16 |8 |32 |16 |32 |8 |16 -.3+| `matrix_type::int8` .3+| `matrix_type::int32` +.3+| `matrix_type::int8` .3+| `matrix_type::sint32` |16 |16 |16 .6+| sm_72 |8 |32 |16 |32 |8 |16 -.3+|`matrix_type::uint8` .3+|`matrix_type::int32` +.3+|`matrix_type::uint8` .3+|`matrix_type::sint32` |16 |16 |16 |8 |32 |16 |32 |8 |16 From afe6471b588faea56aa0ad386defe071f0f1f7aa Mon Sep 17 00:00:00 2001 From: Dounia Date: Fri, 15 Sep 2023 08:56:15 -0700 Subject: [PATCH 05/13] Add clarification about SPR enumerator AOT limitation and rename matrix::combinations to matrix_combinations --- .../sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc | 10 +++++----- .../sycl_ext_oneapi_device_architecture.asciidoc | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index 3d277c71ec7a..3ca5a8831bb9 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -741,12 +741,12 @@ descriptors that can be queried using `get_info` API. [frame="none",options="header"] |====================== | Device descriptors | Return type| Description -|`ext::oneapi::experimental::info::device::matrix::combinations` | +|`ext::oneapi::experimental::info::device::matrix_combinations` | `std::vector`| tells the set of supported matrix sizes and types on this device |====================== -The runtime query returns a vector of `combinations` of `combination` +The runtime query returns a vector of `matrix_combinations` of `combination` type. Each combination includes the sizes and the types for the matrices A, B, C, and D. 
Note that for each matrix hardware, the query returns `max_msize, max_nsize, max_ksize` or `msize, nsize, @@ -790,7 +790,7 @@ struct combination { } // namespace sycl::ext::oneapi::experimental::matrix ``` -Each combination of the `combinations` vector composes the types and +Each combination of the `matrix_combinations` vector composes the types and sizes of A, B, C, and D matrices supported by the device implementation. The table below provides a description of each member of the `combination` struct. @@ -832,7 +832,7 @@ the `T` template parameter as follows: + ```c++ // Ta, Tb, Tc, and Td are the types used in applications std::vector combinations = - device.get_info(); + device.get_info(); for (int i = 0; sizeof(combinations); i++) { if (Ta == combinations[i].atype && Tb == combinations[i].btype && @@ -849,7 +849,7 @@ for (int i = 0; sizeof(combinations); i++) { The table below provides a list of the combinations that `joint_matrix` implementations support on each of Intel AMX and Intel XMX hardware. Note that these can be returned using -`ext::oneapi::experimental::info::device::matrix::combinations`. +`ext::oneapi::experimental::info::device::matrix_combinations`. ==== Intel AMX Supported Combinations This is currently available in devices with the architecture diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc index cd8f878f56f7..993f8874fd57 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc @@ -196,6 +196,10 @@ of these enumerators, and it provides a brief description of their meanings. |- |Any CPU device with the x86_64 instruction set. +|`intel_cpu_spr` +|- +|Intel Xeon processor codenamed Sapphire Rapids. + |`intel_gpu_bdw` |- |Broadwell Intel graphics architecture. @@ -589,6 +593,7 @@ feature, the application must be compiled in ahead-of-time (AOT) mode using "special target values" listed in the link:../../UsersManual.md[users manual] description of the `-fsycl-targets` option. These are the target names of the form "intel_gpu_*", "nvidia_gpu_*", or "amd_gpu_*". +The support of AOT compilation for `intel_cpu_spr` is currently unavailable. 
== Future direction From 1092032119273e15724ed73baff0ab35e771d5d3 Mon Sep 17 00:00:00 2001 From: Dounia Date: Fri, 15 Sep 2023 09:01:39 -0700 Subject: [PATCH 06/13] minor: change test name --- sycl/{test/matrix/query-use.cpp => compile-time-query.cpp} | 0 sycl/test/matrix/runtime-query.cpp | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename sycl/{test/matrix/query-use.cpp => compile-time-query.cpp} (100%) diff --git a/sycl/test/matrix/query-use.cpp b/sycl/compile-time-query.cpp similarity index 100% rename from sycl/test/matrix/query-use.cpp rename to sycl/compile-time-query.cpp diff --git a/sycl/test/matrix/runtime-query.cpp b/sycl/test/matrix/runtime-query.cpp index 64090faad465..d6ccedf24b87 100644 --- a/sycl/test/matrix/runtime-query.cpp +++ b/sycl/test/matrix/runtime-query.cpp @@ -12,7 +12,7 @@ template void matrix_runtime_query(queue q) { std::vector combinations = - q.get_device().get_info(); + q.get_device().get_info(); std::cout << "The matrix hardware implementation in this device provides " "this number of combinations: " From ca43df65b481c0c582243b00297103bd24b33378 Mon Sep 17 00:00:00 2001 From: Dounia Date: Fri, 15 Sep 2023 09:03:31 -0700 Subject: [PATCH 07/13] minor: fix directory accident change --- sycl/{ => test/matrix}/compile-time-query.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sycl/{ => test/matrix}/compile-time-query.cpp (100%) diff --git a/sycl/compile-time-query.cpp b/sycl/test/matrix/compile-time-query.cpp similarity index 100% rename from sycl/compile-time-query.cpp rename to sycl/test/matrix/compile-time-query.cpp From 7164c664d56db99aeb9a97e31c3896510349fab2 Mon Sep 17 00:00:00 2001 From: Dounia Date: Fri, 15 Sep 2023 09:23:51 -0700 Subject: [PATCH 08/13] fix format --- sycl/test/matrix/compile-time-query.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/test/matrix/compile-time-query.cpp b/sycl/test/matrix/compile-time-query.cpp index e39361bbeead..ccab39f90eba 100644 --- a/sycl/test/matrix/compile-time-query.cpp +++ b/sycl/test/matrix/compile-time-query.cpp @@ -1,4 +1,5 @@ -// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -fsycl -o compile-time-query %s +// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -fsycl -o compile-time-query +// %s #include #include From 06b2debeb474774e4f1189d19387dd969f40c29c Mon Sep 17 00:00:00 2001 From: Dounia Date: Fri, 15 Sep 2023 10:01:55 -0700 Subject: [PATCH 09/13] test name change again to avoid format issue --- sycl/test/matrix/{compile-time-query.cpp => compile-query.cpp} | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) rename sycl/test/matrix/{compile-time-query.cpp => compile-query.cpp} (99%) diff --git a/sycl/test/matrix/compile-time-query.cpp b/sycl/test/matrix/compile-query.cpp similarity index 99% rename from sycl/test/matrix/compile-time-query.cpp rename to sycl/test/matrix/compile-query.cpp index ccab39f90eba..133c9b8eb5ec 100644 --- a/sycl/test/matrix/compile-time-query.cpp +++ b/sycl/test/matrix/compile-query.cpp @@ -1,5 +1,4 @@ -// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -fsycl -o compile-time-query -// %s +// RUN: %clangxx -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -fsycl -o compile-query %s #include #include From 7afef97391db8c8e43457b9d91626f0a1b93f8d5 Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 18 Sep 2023 07:00:35 -0700 Subject: [PATCH 10/13] Address Greg's comments and remove ATS from the test case because for names, dg2 is used --- ...cl_ext_oneapi_device_architecture.asciidoc | 15 ++++-- sycl/test/matrix/compile-query.cpp | 50 
+++---------------- 2 files changed, 19 insertions(+), 46 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc index 993f8874fd57..50d8b6c8b77c 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc @@ -198,7 +198,9 @@ of these enumerators, and it provides a brief description of their meanings. |`intel_cpu_spr` |- -|Intel Xeon processor codenamed Sapphire Rapids. +|Intel Xeon processor codenamed Sapphire Rapids. The utility of this +enumeration is currently limited. See the section "Limitations with +the experimental version" for details. |`intel_gpu_bdw` |- @@ -251,7 +253,7 @@ of these enumerators, and it provides a brief description of their meanings. |`intel_gpu_adl_s` + `intel_gpu_rpl_s` |- -|Alder Lake S Intel graphics architecture or Raptor Lake Intel graphics +|Alder Lake S Intel graphics architecture or Raptor Lake Intel graphics architecture. |`intel_gpu_adl_p` @@ -593,7 +595,14 @@ feature, the application must be compiled in ahead-of-time (AOT) mode using "special target values" listed in the link:../../UsersManual.md[users manual] description of the `-fsycl-targets` option. These are the target names of the form "intel_gpu_*", "nvidia_gpu_*", or "amd_gpu_*". -The support of AOT compilation for `intel_cpu_spr` is currently unavailable. +The architecture enumeration `intel_cpu_spr` does not currently work +with any of the APIs described in this extension. It cannot be used +with the `if_architecture_is` function, the +`device::ext_oneapi_architecture_is` function, or the +`info::device::architecture` query descriptor. It currently exists +only for use with the +link:sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc[sycl_ext_oneapi_matrix] +extension. 
== Future direction diff --git a/sycl/test/matrix/compile-query.cpp b/sycl/test/matrix/compile-query.cpp index 133c9b8eb5ec..a430c0c6e228 100644 --- a/sycl/test/matrix/compile-query.cpp +++ b/sycl/test/matrix/compile-query.cpp @@ -39,53 +39,19 @@ void query_amx_spr() { return; } -void query_xmx_dg() { +void query_xmx_dg2() { // generates combination assert - // using myparams = matrix_params; + // using myparams = matrix_params; // generate combination of type assert - // using myparams = matrix_params; + // using myparams = matrix_params; // tells whether a combination is valid or not, if valid, those will be set as // default - using myparams = matrix_params; - size_t dmsize = myparams::M; - size_t dnsize = myparams::N; - size_t dksize = myparams::K; - std::cout << "sizes of Intel XMX of architecture::intel_gpu_dg1 " - "matrix_params chosen by the user are: M " - << dmsize << " N " << dnsize << " K " << dksize << std::endl; - - // sizes-only query: types are given, generate default sizes - using myparams2 = - matrix_params; - dmsize = myparams2::M; - dnsize = myparams2::N; - dksize = myparams2::K; - std::cout << "Default Intel XMX of architecture::intel_gpu_dg1 sizes are: M " - << dmsize << " N " << dnsize << " K " << dksize << std::endl; - return; -} - -void query_xmx_ats() { - - // generates combination assert - // using myparams = matrix_params; - - // generate combination of type assert - // using myparams = matrix_params; - - // tells whether a combination is valid or not, if valid, those will be set as - // default - using myparams = matrix_params; - size_t dmsize = myparams::M; size_t dnsize = myparams::N; size_t dksize = myparams::K; @@ -99,9 +65,8 @@ void query_xmx_ats() { dmsize = myparams2::M; dnsize = myparams2::N; dksize = myparams2::K; - std::cout - << "Default Intel XMX of architecture::intel_gpu_dg2_g10 sizes are: M " - << dmsize << " N " << dnsize << " K " << dksize << std::endl; + std::cout << "Default Intel XMX of architecture::intel_gpu_dg2_g10 sizes are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; return; } @@ -139,8 +104,7 @@ void query_xmx_pvc() { int main() { query_amx_spr(); - query_xmx_dg(); - query_xmx_ats(); + query_xmx_dg2(); query_xmx_pvc(); return 0; } From 430b66638a26f498e72172de277a1d25dc205ccf Mon Sep 17 00:00:00 2001 From: Dounia Date: Mon, 18 Sep 2023 07:04:53 -0700 Subject: [PATCH 11/13] fix formatting --- sycl/test/matrix/compile-query.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sycl/test/matrix/compile-query.cpp b/sycl/test/matrix/compile-query.cpp index a430c0c6e228..e110eef22a38 100644 --- a/sycl/test/matrix/compile-query.cpp +++ b/sycl/test/matrix/compile-query.cpp @@ -42,15 +42,17 @@ void query_amx_spr() { void query_xmx_dg2() { // generates combination assert - // using myparams = matrix_params; + // using myparams = matrix_params; // generate combination of type assert - // using myparams = matrix_params; + // using myparams = matrix_params; // tells whether a combination is valid or not, if valid, those will be set as // default - using myparams = matrix_params; + using myparams = matrix_params; size_t dmsize = myparams::M; size_t dnsize = myparams::N; @@ -65,8 +67,9 @@ void query_xmx_dg2() { dmsize = myparams2::M; dnsize = myparams2::N; dksize = myparams2::K; - std::cout << "Default Intel XMX of architecture::intel_gpu_dg2_g10 sizes are: M " - << dmsize << " N " << dnsize << " K " << dksize << std::endl; + std::cout + << "Default Intel XMX of architecture::intel_gpu_dg2_g10 
sizes are: M " + << dmsize << " N " << dnsize << " K " << dksize << std::endl; return; } From 07673579e80be7e9536f90df4c0410811e8507c5 Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 19 Sep 2023 09:44:52 -0700 Subject: [PATCH 12/13] Address Dmitry comment --- .../sycl_ext_oneapi_matrix.asciidoc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc index 3ca5a8831bb9..d7f2ab8d985f 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc @@ -863,11 +863,11 @@ table below. | A type | B type | C and D type | M | N | K | `matrix_type::uint8` | `matrix_type::uint8` | `matrix_type::sint32` | +<=+ 16 | +<=+ 16 | +<=+ 64 -| `matrix_type::uint8` | `matrix_type::int8` | +| `matrix_type::uint8` | `matrix_type::sint8` | `matrix_type::sint32` | +<=+ 16 | +<=+ 16 | +<=+ 64 -| `matrix_type::int8` | `matrix_type::uint8` | +| `matrix_type::sint8` | `matrix_type::uint8` | `matrix_type::sint32` | +<=+ 16 | +<=+ 16 | +<=+ 64 -| `matrix_type::int8` | `matrix_type::int8` | +| `matrix_type::sint8` | `matrix_type::sint8` | `matrix_type::sint32` | +<=+ 16 | +<=+ 16 | +<=+ 64 | `matrix_type::bf16` | `matrix_type::bf16` | `matrix_type::fp32` | +<=+ 16 | +<=+ 16 | +<=+ 32 @@ -889,15 +889,15 @@ column in the table below. `matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 |`architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -.2+| `matrix_type::uint8` .2+| `matrix_type::int8` .2+| +.2+| `matrix_type::uint8` .2+| `matrix_type::sint8` .2+| `matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | `architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -.2+| `matrix_type::int8` .2+| `matrix_type::uint8` .2+| +.2+| `matrix_type::sint8` .2+| `matrix_type::uint8` .2+| `matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | `architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` -.2+| `matrix_type::int8` .2+| `matrix_type::int8` .2+| +.2+| `matrix_type::sint8` .2+| `matrix_type::sint8` .2+| `matrix_type::sint32` .2+| +<=+ 8 | 16 .2+| 32 | `architecture::intel_gpu_pvc`|8|`architecture::intel_gpu_dg2_g10, architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12` @@ -940,7 +940,7 @@ supported parameter combination is specified in the following table. 
|16 |16 |16 |8 |32 |16 |32 |8 |16 -.3+| `matrix_type::int8` .3+| `matrix_type::sint32` +.3+| `matrix_type::sint8` .3+| `matrix_type::sint32` |16 |16 |16 .6+| sm_72 |8 |32 |16 |32 |8 |16 From 81ace9033ad9c9ee569be0eabc4d88bedb7d1133 Mon Sep 17 00:00:00 2001 From: Dounia Date: Wed, 20 Sep 2023 12:06:09 -0700 Subject: [PATCH 13/13] Add blank line for SPR limitation paragraph --- .../experimental/sycl_ext_oneapi_device_architecture.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc index 50d8b6c8b77c..e6a51ed9d103 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc @@ -595,6 +595,7 @@ feature, the application must be compiled in ahead-of-time (AOT) mode using "special target values" listed in the link:../../UsersManual.md[users manual] description of the `-fsycl-targets` option. These are the target names of the form "intel_gpu_*", "nvidia_gpu_*", or "amd_gpu_*". + The architecture enumeration `intel_cpu_spr` does not currently work with any of the APIs described in this extension. It cannot be used with the `if_architecture_is` function, the
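As an end-to-end illustration of the compile-time query that this series renames from `tpu_params` to `matrix_params`, here is a minimal sketch in the spirit of the `compile-query.cpp` test above. The `int8_t`/`int` element types, the PVC shape 8 x 16 x 32 (taken from the supported-combinations table in the spec diff), and the single `<sycl/sycl.hpp>` include are assumptions of the sketch, not requirements stated by the patches.

```c++
// Built like the test above: clang++ -fsycl -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 <file>
#include <iostream>
#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;
using namespace sycl::ext::oneapi::experimental::matrix;

int main() {
  // Default-values query: only the element types are supplied; the sizes are
  // filled in with defaults valid for Intel AMX (architecture::intel_cpu_spr).
  using amx_params = matrix_params<syclex::architecture::intel_cpu_spr, int8_t,
                                   int8_t, int, int>;
  constexpr std::size_t M = amx_params::M;
  constexpr std::size_t N = amx_params::N;
  constexpr std::size_t K = amx_params::K;
  std::cout << "AMX int8 defaults: M=" << M << " N=" << N << " K=" << K << "\n";

  // Validation query: element types and sizes are supplied; an unsupported
  // combination is rejected at compile time by the static_assert inside
  // matrix_params, and the joint_matrix_a/b/c/d aliases can then be used in a kernel.
  using pvc_params = matrix_params<syclex::architecture::intel_gpu_pvc, int8_t,
                                   int8_t, int, int, 8, 16, 32>;
  static_assert(pvc_params::M == 8 && pvc_params::N == 16 && pvc_params::K == 32,
                "unexpected shape");
  return 0;
}
```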