diff --git a/SYCL/Matrix/joint_matrix_query_default_tpu_sel.cpp b/SYCL/Matrix/joint_matrix_query_default_tpu_sel.cpp new file mode 100644 index 0000000000..01acc3a9e1 --- /dev/null +++ b/SYCL/Matrix/joint_matrix_query_default_tpu_sel.cpp @@ -0,0 +1,264 @@ +//==---- joint_matrix_query_default_tpu_sel.cpp - DPC++ joint_matrix-------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_vnni(unsigned int rows, unsigned int cols, T *src, T *dest, + unsigned int vnniFactor) { + for (unsigned int i = 0; i < rows / vnniFactor; i++) { + for (unsigned int j = 0; j < cols; j++) { + for (unsigned int k = 0; k < vnniFactor; k++) { + dest[i * cols * vnniFactor + j * vnniFactor + k] = + src[(i * vnniFactor + k) * cols + j]; + } + } + } +} +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { + asm volatile("cpuid" + : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) + : "0"(*eax), "2"(*ecx)); +} + +template +void matrix_multiply(queue q, big_matrix &C, big_matrix &A, + big_matrix &B) { + constexpr size_t SG_SZ = TN; + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 4) * (N * 4) + + sg_starty / SG_SZ * TN * 4, + N * 4); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +bool is_this_amx_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::cpu) { + std::cout << "This is a CPU Device" << std::endl; + unsigned eax, ebx, ecx, edx; + + eax = 1; /* processor info and feature bits */ + native_cpuid(&eax, &ebx, &ecx, &edx); + int model = (eax >> 4) & 0xF; + int family = (eax >> 8) & 0xF; + int extended_model = (eax >> 16) & 0xF; + int extended_family = (eax >> 20) & 0xFF; + + // SPR processsor means AMX is present + if (model == 15 && family == 6 && extended_model == 8 && + extended_family == 0) { + std::cout << "This is a SPR CPU Device" << std::endl; + return true; + } + } + return false; +} + +bool is_this_xmx16_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::gpu) { + std::cout << "This is a GPU Device" << std::endl; + + auto dev_name = + q.get_device().template get_info(); + size_t found = dev_name.find("Intel(R) Graphics [0x0bd5]"); // PVC + if (found != std::string::npos) { + std::cout << "This is a PVC Device which contains XMX-16 TPU" + << std::endl; + return true; + } + } + return false; +} + +bool is_this_xmx8_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::gpu) { + std::cout << "This is a GPU Device" << std::endl; + auto dev_name = + q.get_device().template get_info(); + size_t found = dev_name.find("Intel(R) Graphics [0x5610]"); // ATS-P + if (found != std::string::npos) { + std::cout << "This is a ATS-M Device which contains XMX-8 TPU" + << std::endl; + return true; + } + } + return false; +} + +template +bool query_and_matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + queue q; + + if (is_this_amx_device(q)) { + using myparams2 = tpu_params; + constexpr int TM = myparams2::M; + constexpr int TN = myparams2::N; + constexpr int TK = myparams2::K; + std::cout << "AMX query default sizes are: M " << TM << " N " << TN << " K " + << TK << std::endl; + matrix_multiply(q, C, A, B); + return true; + } + + if (is_this_xmx16_device(q)) { + using myparams2 = tpu_params; + constexpr int TM = myparams2::M; + constexpr int TN = myparams2::N; + constexpr int TK = myparams2::K; + std::cout << "XMX of PVC query default sizes are: M " << TM << " N " << TN + << " K " << TK << std::endl; + matrix_multiply(q, C, A, B); + return true; + } + + if (is_this_xmx8_device(q)) { + using myparams2 = tpu_params; + constexpr int TM = myparams2::M; + constexpr int TN = myparams2::N; + constexpr int TK = myparams2::K; + std::cout << "XMX of ATS-M query default sizes are: M " << TM << " N " << TN + << " K " << TK << std::endl; + matrix_multiply(q, C, A, B); + return true; + } + return false; +} + +void matrix_multiply_ref(int8_t *A, int8_t *B, int32_t *C, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + C[m * N + n] += A[m * K + k] * B[k * N + n]; + } + } +} + +int main() { + static constexpr size_t MATRIX_M = 128; + static constexpr size_t MATRIX_N = 128; + static constexpr size_t MATRIX_K = 128; + static constexpr int vnniFactor = 4; + int8_t A[MATRIX_M][MATRIX_K]; + int8_t B[MATRIX_K][MATRIX_N]; + int8_t Bvnni[MATRIX_K / 4][MATRIX_N * 4]; + int32_t C[MATRIX_M][MATRIX_N]; + int32_t D[MATRIX_M][MATRIX_N]; + + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i + 2 * j; + } + } + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B[i][j] = i + j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((int8_t *)&A); + matrix_vnni(MATRIX_K, MATRIX_N, (int8_t *)&B, (int8_t *)&Bvnni, 4); + big_matrix MBvnni((int8_t *)&Bvnni); + + bool sel = query_and_matrix_multiply(MC, MA, MBvnni); + bool res = true; + if (sel) { + matrix_multiply_ref((int8_t *)&A, (int8_t *)&B, (int32_t *)&D, MATRIX_M, + MATRIX_N, MATRIX_K); + + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + std::cout << (res ? "passed" : "failed") << std::endl; + } else + std::cout << "default device selector has no TPU" << std::endl; + return !res; +} diff --git a/SYCL/Matrix/joint_matrix_query_general_constexpr_tpu_sel.cpp b/SYCL/Matrix/joint_matrix_query_general_constexpr_tpu_sel.cpp new file mode 100644 index 0000000000..bd8494d07b --- /dev/null +++ b/SYCL/Matrix/joint_matrix_query_general_constexpr_tpu_sel.cpp @@ -0,0 +1,367 @@ +//==--joint_matrix_query_general_constexpr_tpu_sel.cpp - DPC++ joint_matrix-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_vnni(unsigned int rows, unsigned int cols, T *src, T *dest, + unsigned int vnniFactor) { + for (unsigned int i = 0; i < rows / vnniFactor; i++) { + for (unsigned int j = 0; j < cols; j++) { + for (unsigned int k = 0; k < vnniFactor; k++) { + dest[i * cols * vnniFactor + j * vnniFactor + k] = + src[(i * vnniFactor + k) * cols + j]; + } + } + } +} +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { + asm volatile("cpuid" + : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) + : "0"(*eax), "2"(*ecx)); +} + +template +void matrix_multiply(queue q, big_matrix &C, big_matrix &A, + big_matrix &B) { + constexpr size_t SG_SZ = TN; + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 4) * (N * 4) + + sg_starty / SG_SZ * TN * 4, + N * 4); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +bool is_this_amx_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::cpu) { + std::cout << "This is a CPU Device" << std::endl; + unsigned eax, ebx, ecx, edx; + + eax = 1; /* processor info and feature bits */ + native_cpuid(&eax, &ebx, &ecx, &edx); + int model = (eax >> 4) & 0xF; + int family = (eax >> 8) & 0xF; + int extended_model = (eax >> 16) & 0xF; + int extended_family = (eax >> 20) & 0xFF; + + // SPR processsor means AMX is present + if (model == 15 && family == 6 && extended_model == 8 && + extended_family == 0) { + std::cout << "This is a SPR CPU Device" << std::endl; + return true; + } + } + return false; +} + +bool is_this_xmx16_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::gpu) { + std::cout << "This is a GPU Device" << std::endl; + auto dev_name = + q.get_device().template get_info(); + size_t found = dev_name.find("Intel(R) Graphics [0x0bd5]"); // PVC + if (found != std::string::npos) { + std::cout << "This is a PVC Device which contains XMX-16 TPU" + << std::endl; + return true; + } + } + return false; +} + +bool is_this_xmx8_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::gpu) { + std::cout << "This is a GPU Device" << std::endl; + auto dev_name = + q.get_device().template get_info(); + size_t found = dev_name.find("Intel(R) Graphics [0x5610]"); // ATS-M + if (found != std::string::npos) { + std::cout << "This is a ATS-M Device which contains XMX-8 TPU" + << std::endl; + return true; + } + } + return false; +} + +template +constexpr std::tuple +find_balanced_size(tpu_params params, int M, int optsize, matrix_type Ta, + matrix_type Tb, matrix_type Tc) { + // find smaller joint matrix size M that keeps maximum CUs busy + for (int i = 0; params.num_combinations; i++) { + if (Ta == params.combinations[i].atype && + Tb == params.combinations[i].btype && + Tc == params.combinations[i].accumulatortype) { + if (params.combinations[i].msize == 0) { + // this is max size query + if (optsize <= params.combinations[i].max_msize) + return std::make_tuple(optsize, params.combinations[i].max_nsize, + params.combinations[i].max_ksize); + } else { + if (params.combinations[i].msize == optsize) + return std::make_tuple(optsize, params.combinations[i].nsize, + params.combinations[i].ksize); + } + } + } + return {-1, -1, -1}; +} + +template bool is_sg_scope_supported(tpu_params params) { + for (int i = 0; i < params.num_scopes; i++) + if (params.scopes[i] == scope_t::sub_group) + return true; + return false; +} + +template +int query_and_matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + queue q; + // This use case shows M < Number of units in the device (M < NumUnits) + // when sizes are relatively small, we load balance all the compute units by + // using smaller matrix sizes and the maximum number of these compute units. + // This leads to faster execution + uint32_t max_compute_units = + q.get_device().template get_info(); + std::cout << "max compute units are " << max_compute_units << " and M is " + << M << std::endl; + if (is_this_amx_device(q)) { + constexpr int NumUnits = 224; + constexpr int ratio = NumUnits / M; // >=1 in this case + using defparams = tpu_params; + constexpr tpu_params genparams = tpu_params(); + // find the combination that fills these EUs + const int optsize = int((defparams::M) / ratio); + constexpr std::tuple optsizes = + find_balanced_size(genparams, M, optsize, matrix_type::sint8, + matrix_type::sint8, matrix_type::sint32); + constexpr int TM = std::get<0>(optsizes); + constexpr int TN = std::get<1>(optsizes); + constexpr int TK = std::get<2>(optsizes); + if (TM != -1 && is_sg_scope_supported(genparams)) { + std::cout << "sub-group scope is supported on AMX and sizes that leads " + "to faster execution are, M " + << TM << " N " << TN << " TK " << TK << std::endl; + matrix_multiply(q, C, A, B); + return 1; + } else if (TM == -1 && is_sg_scope_supported(genparams)) { + constexpr int TM = defparams::M; + std::cout + << "sub-group scope is supported on XMX16 of PVC but M is default " + << TM << std::endl; + matrix_multiply(q, C, A, + B); + return 1; + } else + return 2; + } + + if (is_this_xmx16_device(q)) { + constexpr int NumUnits = 1024; + constexpr int ratio = NumUnits / M; + using defparams = tpu_params; + constexpr tpu_params genparams = tpu_params(); + + // find the combination that fills these EUs + const int optsize = int((defparams::M) / ratio); + + constexpr std::tuple optsizes = + find_balanced_size(genparams, M, optsize, matrix_type::sint8, + matrix_type::sint8, matrix_type::sint32); + constexpr int TM = std::get<0>(optsizes); + constexpr int TN = std::get<1>(optsizes); + constexpr int TK = std::get<2>(optsizes); + if (TM != -1 && is_sg_scope_supported(genparams)) { + std::cout << "sub-group scope is supported on XMX16 of PVC and sizes " + "that lead to faster execution are, M " + << TM << " N " << TN << " TK " << TK << std::endl; + matrix_multiply(q, C, A, B); + return 1; + } else if (TM == -1 && is_sg_scope_supported(genparams)) { + constexpr int TM = defparams::M; + std::cout + << "sub-group scope is supported on XMX16 of PVC but M is default " + << TM << std::endl; + matrix_multiply(q, C, A, + B); + return 1; + } else + return 2; + } + + if (is_this_xmx8_device(q)) { + constexpr int NumUnits = 1024; + constexpr int ratio = NumUnits / M; + using defparams = tpu_params; + constexpr tpu_params genparams = tpu_params(); + // find the combination that fills these EUs + const int optsize = int((defparams::M) / ratio); + + constexpr std::tuple optsizes = + find_balanced_size(genparams, M, optsize, matrix_type::sint8, + matrix_type::sint8, matrix_type::sint32); + constexpr int TM = std::get<0>(optsizes); + constexpr int TN = std::get<1>(optsizes); + constexpr int TK = std::get<2>(optsizes); + if (TM != -1 && is_sg_scope_supported(genparams)) { + std::cout << "sub-group scope is supported on XMX16 of PVC and M that " + "leads to faster execution is " + << TM << std::endl; + matrix_multiply(q, C, A, B); + return 1; + } else if (TM == -1 && is_sg_scope_supported(genparams)) { + constexpr int TM = defparams::M; + std::cout + << "sub-group scope is supported on XMX8 of ATS-M but M is default " + << TM << std::endl; + matrix_multiply(q, C, A, + B); + return 1; + } else + return 2; + } + return 0; +} + +void matrix_multiply_ref(int8_t *A, int8_t *B, int32_t *C, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + C[m * N + n] += A[m * K + k] * B[k * N + n]; + } + } +} + +int main() { + static constexpr size_t MATRIX_M = 128; + static constexpr size_t MATRIX_N = 128; + static constexpr size_t MATRIX_K = 128; + static constexpr int vnniFactor = 4; + int8_t A[MATRIX_M][MATRIX_K]; + int8_t B[MATRIX_K][MATRIX_N]; + int8_t Bvnni[MATRIX_K / 4][MATRIX_N * 4]; + int32_t C[MATRIX_M][MATRIX_N]; + int32_t D[MATRIX_M][MATRIX_N]; + + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i + 2 * j; + } + } + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B[i][j] = i + j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((int8_t *)&A); + matrix_vnni(MATRIX_K, MATRIX_N, (int8_t *)&B, (int8_t *)&Bvnni, 4); + big_matrix MBvnni((int8_t *)&Bvnni); + + bool sel = query_and_matrix_multiply(MC, MA, MBvnni); + bool res = true; + if (sel == 1) { + matrix_multiply_ref((int8_t *)&A, (int8_t *)&B, (int32_t *)&D, MATRIX_M, + MATRIX_N, MATRIX_K); + + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + std::cout << (res ? "passed" : "failed") << std::endl; + } else + ((sel == 0) ? std::cout << "default device selector has no TPU" << std::endl + : std::cout << "Selected combination is not supported by the " + "default device selector" + << std::endl); + return !res; +} diff --git a/SYCL/Matrix/joint_matrix_query_general_tpu_sel.cpp b/SYCL/Matrix/joint_matrix_query_general_tpu_sel.cpp new file mode 100644 index 0000000000..23dfc22857 --- /dev/null +++ b/SYCL/Matrix/joint_matrix_query_general_tpu_sel.cpp @@ -0,0 +1,311 @@ +//==----- joint_matrix_query_general_tpu_sel.cpp - DPC++ joint_matrix------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +template struct big_matrix { +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_vnni(unsigned int rows, unsigned int cols, T *src, T *dest, + unsigned int vnniFactor) { + for (unsigned int i = 0; i < rows / vnniFactor; i++) { + for (unsigned int j = 0; j < cols; j++) { + for (unsigned int k = 0; k < vnniFactor; k++) { + dest[i * cols * vnniFactor + j * vnniFactor + k] = + src[(i * vnniFactor + k) * cols + j]; + } + } + } +} +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { + asm volatile("cpuid" + : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) + : "0"(*eax), "2"(*ecx)); +} + +template +void matrix_multiply(queue q, big_matrix &C, big_matrix &A, + big_matrix &B) { + constexpr size_t SG_SZ = TN; + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + + joint_matrix sub_a; + joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 4) * (N * 4) + + sg_starty / SG_SZ * TN * 4, + N * 4); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +bool is_this_amx_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::cpu) { + std::cout << "This is a CPU Device" << std::endl; + unsigned eax, ebx, ecx, edx; + + eax = 1; /* processor info and feature bits */ + native_cpuid(&eax, &ebx, &ecx, &edx); + int model = (eax >> 4) & 0xF; + int family = (eax >> 8) & 0xF; + int extended_model = (eax >> 16) & 0xF; + int extended_family = (eax >> 20) & 0xFF; + + // SPR processsor means AMX is present + if (model == 15 && family == 6 && extended_model == 8 && + extended_family == 0) { + std::cout << "This is a SPR CPU Device" << std::endl; + return true; + } + } + return false; +} + +bool is_this_xmx16_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::gpu) { + std::cout << "This is a GPU Device" << std::endl; + auto dev_name = + q.get_device().template get_info(); + size_t found = dev_name.find("Intel(R) Graphics [0x0bd5]"); // PVC + if (found != std::string::npos) { + std::cout << "This is a PVC Device which contains XMX-16 TPU" + << std::endl; + return true; + } + } + return false; +} + +bool is_this_xmx8_device(queue q) { + auto dev_type = + q.get_device().template get_info(); + if (dev_type == info::device_type::gpu) { + std::cout << "This is a GPU Device" << std::endl; + auto dev_name = + q.get_device().template get_info(); + size_t found = dev_name.find("Intel(R) Graphics [0x5610]"); // ATS-M + if (found != std::string::npos) { + std::cout << "This is a ATS-M Device which contains XMX-8 TPU" + << std::endl; + return true; + } + } + return false; +} + +template +bool find_size_combinations(tpu_params params, int M, matrix_type Ta, + matrix_type Tb, matrix_type Tc) { + for (int i = 0; params.num_combinations; i++) { + if (Ta == params.combinations[i].atype && + Tb == params.combinations[i].btype && + Tc == params.combinations[i].accumulatortype) { + if (params.combinations[i].msize == 0) { + // this is max size query + if (M <= params.combinations[i].max_msize) + return true; + } else { + if (params.combinations[i].msize == M) { + return true; + } + } + } + } + return false; +} + +template bool is_sg_scope_supported(tpu_params params) { + for (int i = 0; i < params.num_scopes; i++) + if (params.scopes[i] == scope_t::sub_group) + return true; + return false; +} + +template +int query_and_matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + queue q; + + if (is_this_amx_device(q)) { + using myparams = tpu_params; + constexpr int TM = 4; + constexpr int TN = myparams::N; + constexpr int TK = myparams::K; + tpu_params params = tpu_params(); + if (find_size_combinations(params, TM, matrix_type::sint8, + matrix_type::sint8, matrix_type::sint32) && + is_sg_scope_supported(params)) { + std::cout << "M = 4 and sub-group scope is supported on AMX" << std::endl; + matrix_multiply(q, C, A, B); + return 1; + } else + return 2; + } + + if (is_this_xmx16_device(q)) { + using myparams = tpu_params; + constexpr int TM = 4; + constexpr int TN = myparams::N; + constexpr int TK = myparams::K; + tpu_params params = tpu_params(); + if (find_size_combinations(params, TM, matrix_type::sint8, + matrix_type::sint8, matrix_type::sint32) && + is_sg_scope_supported(params)) { + std::cout << "M = 4 and sub-group scope is supported on XMX16 of PVC" + << std::endl; + matrix_multiply(q, C, A, B); + return 1; + } else + return 2; + } + + if (is_this_xmx8_device(q)) { + using myparams = tpu_params; + constexpr int TM = 4; + constexpr int TN = myparams::N; + constexpr int TK = myparams::K; + tpu_params params = tpu_params(); + if (find_size_combinations(params, TM, matrix_type::sint8, + matrix_type::sint8, matrix_type::sint32) && + is_sg_scope_supported(params)) { + std::cout << "M = 4 and sub-group scope is supported on XMX8 of ATS-M" + << std::endl; + matrix_multiply(q, C, A, B); + return 1; + } else + return 2; + } + return 0; +} + +void matrix_multiply_ref(int8_t *A, int8_t *B, int32_t *C, int M, int N, + int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + C[m * N + n] += A[m * K + k] * B[k * N + n]; + } + } +} + +int main() { + static constexpr size_t MATRIX_M = 128; + static constexpr size_t MATRIX_N = 128; + static constexpr size_t MATRIX_K = 128; + static constexpr int vnniFactor = 4; + int8_t A[MATRIX_M][MATRIX_K]; + int8_t B[MATRIX_K][MATRIX_N]; + int8_t Bvnni[MATRIX_K / 4][MATRIX_N * 4]; + int32_t C[MATRIX_M][MATRIX_N]; + int32_t D[MATRIX_M][MATRIX_N]; + + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i + 2 * j; + } + } + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + B[i][j] = i + j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((int8_t *)&A); + matrix_vnni(MATRIX_K, MATRIX_N, (int8_t *)&B, (int8_t *)&Bvnni, 4); + big_matrix MBvnni((int8_t *)&Bvnni); + + bool sel = query_and_matrix_multiply(MC, MA, MBvnni); + bool res = true; + if (sel == 1) { + matrix_multiply_ref((int8_t *)&A, (int8_t *)&B, (int32_t *)&D, MATRIX_M, + MATRIX_N, MATRIX_K); + + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + std::cout << (res ? "passed" : "failed") << std::endl; + } else + ((sel == 0) ? std::cout << "default device selector has no TPU" << std::endl + : std::cout << "Selected combination is not supported by the " + "default device selector" + << std::endl); + return !res; +}