From 025cf7ecb4373e248841227d3724ad4f94062b8e Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 25 Jan 2022 14:06:25 +0000 Subject: [PATCH 01/26] Added bfloat16 support for cuda backend. Added bfloat16 in oneapi experimental namespace. Signed-off-by: jack.kirk --- .../sycl/ext/oneapi/experimental/bfloat16.hpp | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp new file mode 100644 index 0000000000000..329094634d9ad --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp @@ -0,0 +1,161 @@ +//==--------- bfloat16.hpp ------- SYCL bfloat16 conversion ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace ext { +namespace oneapi { +namespace experimental { + +class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { + using storage_t = uint16_t; + storage_t value; + +public: + bfloat16() = default; + bfloat16(const bfloat16 &) = default; + ~bfloat16() = default; + + // Explicit conversion functions + static storage_t from_float(const float &a) { +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + return __nvvm_f2bf16_rn(a); +#else + return __spirv_ConvertFToBF16INTEL(a); +#endif +#else + throw exception{errc::feature_not_supported, + "Bfloat16 conversion is not supported on host device"}; +#endif + } + static float to_float(const storage_t &a) { +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + unsigned int y = a; + y = y << 16; + float *res = 
reinterpret_cast<float *>(&y);
+    return *res;
+#else
+    return __spirv_ConvertBF16ToFINTEL(a);
+#endif
+#else
+    throw exception{errc::feature_not_supported,
+                    "Bfloat16 conversion is not supported on host device"};
+#endif
+  }
+
+  static bfloat16 from_bits(const storage_t &a) {
+    bfloat16 res;
+    res.value = a;
+    return res;
+  }
+
+  // Implicit conversion from float to bfloat16
+  bfloat16(const float &a) { value = from_float(a); }
+
+  bfloat16 &operator=(const float &rhs) {
+    value = from_float(rhs);
+    return *this;
+  }
+
+  // Implicit conversion from bfloat16 to float
+  operator float() const { return to_float(value); }
+  operator sycl::half() const { return to_float(value); }
+
+  // Get raw bits representation of bfloat16
+  storage_t raw() const { return value; }
+
+  // Logical operators (!,||,&&) are covered if we can cast to bool
+  explicit operator bool() { return to_float(value) != 0.0f; }
+
+  // Unary minus operator overloading
+  friend bfloat16 operator-(bfloat16 &lhs) {
+    return bfloat16{-to_float(lhs.value)};
+  }
+
+// Increment and decrement operators overloading
+#define OP(op)                                                                 \
+  friend bfloat16 &operator op(bfloat16 &lhs) {                                \
+    float f = to_float(lhs.value);                                             \
+    lhs.value = from_float(op f);                                              \
+    return lhs;                                                                \
+  }                                                                            \
+  friend bfloat16 operator op(bfloat16 &lhs, int) {                            \
+    bfloat16 old = lhs;                                                        \
+    operator op(lhs);                                                          \
+    return old;                                                                \
+  }
+  OP(++)
+  OP(--)
+#undef OP
+
+  // Assignment operators overloading
+#define OP(op)                                                                 \
+  friend bfloat16 &operator op(bfloat16 &lhs, const bfloat16 &rhs) {           \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend bfloat16 &operator op(bfloat16 &lhs, const T &rhs) {                  \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }                                                                            \
+  template <typename T> friend T &operator op(T &lhs, const bfloat16 &rhs) {   \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }
+  OP(+=)
+  OP(-=)
+  OP(*=)
+  OP(/=)
+#undef OP
+
+// Binary operators overloading
+#define OP(type, op)                                                           \
+  friend type operator op(const bfloat16 &lhs, const bfloat16 &rhs) {          \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend type operator op(const bfloat16 &lhs, const T &rhs) {                 \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend type operator op(const T &lhs, const bfloat16 &rhs) {                 \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }
+  OP(bfloat16, +)
+  OP(bfloat16, -)
+  OP(bfloat16, *)
+  OP(bfloat16, /)
+  OP(bool, ==)
+  OP(bool, !=)
+  OP(bool, <)
+  OP(bool, >)
+  OP(bool, <=)
+  OP(bool, >=)
+#undef OP
+
+  // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported
+  // for floating-point types.
+};
+
+} // namespace experimental
+} // namespace intel
+} // namespace ext
+
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)

From 66b4e3344bc7a9e514d857d4931ba26ed192b3f9 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Tue, 25 Jan 2022 14:13:58 +0000
Subject: [PATCH 02/26] deleted intel namespace bfloat16.

---
 .../sycl/ext/intel/experimental/bfloat16.hpp  | 150 ------------------
 1 file changed, 150 deletions(-)
 delete mode 100644 sycl/include/sycl/ext/intel/experimental/bfloat16.hpp

diff --git a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp b/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp
deleted file mode 100644
index 5a51f3746e225..0000000000000
--- a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-//==--------- bfloat16.hpp ------- SYCL bfloat16 conversion ----------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include - -__SYCL_INLINE_NAMESPACE(cl) { -namespace sycl { -namespace ext { -namespace intel { -namespace experimental { - -class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { - using storage_t = uint16_t; - storage_t value; - -public: - bfloat16() = default; - bfloat16(const bfloat16 &) = default; - ~bfloat16() = default; - - // Explicit conversion functions - static storage_t from_float(const float &a) { -#if defined(__SYCL_DEVICE_ONLY__) - return __spirv_ConvertFToBF16INTEL(a); -#else - throw exception{errc::feature_not_supported, - "Bfloat16 conversion is not supported on host device"}; -#endif - } - static float to_float(const storage_t &a) { -#if defined(__SYCL_DEVICE_ONLY__) - return __spirv_ConvertBF16ToFINTEL(a); -#else - throw exception{errc::feature_not_supported, - "Bfloat16 conversion is not supported on host device"}; -#endif - } - - static bfloat16 from_bits(const storage_t &a) { - bfloat16 res; - res.value = a; - return res; - } - - // Implicit conversion from float to bfloat16 - bfloat16(const float &a) { value = from_float(a); } - - bfloat16 &operator=(const float &rhs) { - value = from_float(rhs); - return *this; - } - - // Implicit conversion from bfloat16 to float - operator float() const { return to_float(value); } - operator sycl::half() const { return to_float(value); } - - // Get raw bits representation of bfloat16 - storage_t raw() const { return value; } - - // Logical operators (!,||,&&) are covered if we can cast to bool - explicit operator bool() { return to_float(value) != 0.0f; } - - // Unary minus operator overloading - friend bfloat16 operator-(bfloat16 &lhs) { - return bfloat16{-to_float(lhs.value)}; - } - -// Increment and decrement operators overloading -#define OP(op) \ - friend bfloat16 &operator op(bfloat16 &lhs) { \ - float f = 
to_float(lhs.value); \ - lhs.value = from_float(op f); \ - return lhs; \ - } \ - friend bfloat16 operator op(bfloat16 &lhs, int) { \ - bfloat16 old = lhs; \ - operator op(lhs); \ - return old; \ - } - OP(++) - OP(--) -#undef OP - - // Assignment operators overloading -#define OP(op) \ - friend bfloat16 &operator op(bfloat16 &lhs, const bfloat16 &rhs) { \ - float f = static_cast(lhs); \ - f op static_cast(rhs); \ - return lhs = f; \ - } \ - template \ - friend bfloat16 &operator op(bfloat16 &lhs, const T &rhs) { \ - float f = static_cast(lhs); \ - f op static_cast(rhs); \ - return lhs = f; \ - } \ - template friend T &operator op(T &lhs, const bfloat16 &rhs) { \ - float f = static_cast(lhs); \ - f op static_cast(rhs); \ - return lhs = f; \ - } - OP(+=) - OP(-=) - OP(*=) - OP(/=) -#undef OP - -// Binary operators overloading -#define OP(type, op) \ - friend type operator op(const bfloat16 &lhs, const bfloat16 &rhs) { \ - return type{static_cast(lhs) op static_cast(rhs)}; \ - } \ - template \ - friend type operator op(const bfloat16 &lhs, const T &rhs) { \ - return type{static_cast(lhs) op static_cast(rhs)}; \ - } \ - template \ - friend type operator op(const T &lhs, const bfloat16 &rhs) { \ - return type{static_cast(lhs) op static_cast(rhs)}; \ - } - OP(bfloat16, +) - OP(bfloat16, -) - OP(bfloat16, *) - OP(bfloat16, /) - OP(bool, ==) - OP(bool, !=) - OP(bool, <) - OP(bool, >) - OP(bool, <=) - OP(bool, >=) -#undef OP - - // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported - // for floating-point types. -}; - -} // namespace experimental -} // namespace intel -} // namespace ext - -} // namespace sycl -} // __SYCL_INLINE_NAMESPACE(cl) From 2d04406d0198b5321cf2aaa870d395e9f042755b Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 25 Jan 2022 14:29:32 +0000 Subject: [PATCH 03/26] Format. 
--- sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp index 329094634d9ad..ef1f01d5340ae 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp @@ -154,7 +154,7 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { }; } // namespace experimental -} // namespace intel +} // namespace oneapi } // namespace ext } // namespace sycl From 9418f74ee1e1a35918f5bcf99a3d0d57f29dac90 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 25 Jan 2022 14:35:02 +0000 Subject: [PATCH 04/26] Changed extension macro name. --- sycl/include/CL/sycl/feature_test.hpp.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in index e6053ebf4ff1c..9bd849ca27d8a 100644 --- a/sycl/include/CL/sycl/feature_test.hpp.in +++ b/sycl/include/CL/sycl/feature_test.hpp.in @@ -46,7 +46,7 @@ namespace sycl { #define SYCL_EXT_ONEAPI_USE_PINNED_HOST_MEMORY_PROPERTY 1 #define SYCL_EXT_ONEAPI_SRGB 1 #define SYCL_EXT_ONEAPI_SUB_GROUP 1 -#define SYCL_EXT_INTEL_BF16_CONVERSION 1 +#define SYCL_EXT_ONEAPI_BF16_CONVERSION 1 #define SYCL_EXT_INTEL_BITCAST 1 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1 #ifdef __clang__ From 4d99f3f97c06dc529d1d6b16df645b1af27fa8a6 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Thu, 17 Feb 2022 10:22:48 +0000 Subject: [PATCH 05/26] fixed test. 
--- sycl/test/extensions/bfloat16.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp index 6be5459642d0c..dd87806942f8a 100644 --- a/sycl/test/extensions/bfloat16.cpp +++ b/sycl/test/extensions/bfloat16.cpp @@ -2,10 +2,10 @@ // UNSUPPORTED: cuda || hip_amd -#include +#include #include -using sycl::ext::intel::experimental::bfloat16; +using sycl::ext::oneapi::experimental::bfloat16; SYCL_EXTERNAL uint16_t some_bf16_intrinsic(uint16_t x, uint16_t y); SYCL_EXTERNAL void foo(long x, sycl::half y); From f6cf7b865a64f9ea8c7890750143995454a32ffa Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Wed, 2 Mar 2022 11:45:40 +0000 Subject: [PATCH 06/26] Implemented fp19 mma using the natural storage type uint32_t. Signed-off-by: jack.kirk --- .../ext/oneapi/matrix/matrix-tensorcore.hpp | 21 +++- .../matrix/matrix-nvptx-fp19-test.cpp | 112 ++++++++++++++++++ 2 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index 5c6df9114b161..cce65f54c0999 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -81,6 +81,10 @@ __SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2) __SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2) __SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8) + // m16n16k8 fp19 +__SYCL_JOINT_MATRIX_OVERLOAD(uint32_t, a, 16, 8, int32_t, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(uint32_t, b, 8, 16, int32_t, 4) + #undef __SYCL_JOINT_MATRIX_OVERLOAD } // namespace experimental::matrix @@ -271,7 +275,17 @@ struct joint_matrix_load_impl< __dmma_m8n8k4_ld_c(res.data, src.get(), stride, get_layout_id()); } - } + } else if constexpr (std::is_same::value) { + int32_t *tileptr 
= reinterpret_cast(src.get()); + if constexpr (NumRows == 16 && NumCols == 8) { + __mma_tf32_m16n16k8_ld_a(res.data, tileptr, stride, + get_layout_id()); + } + else if constexpr (NumRows == 8 && NumCols == 16) { + __mma_tf32_m16n16k8_ld_b(res.data, tileptr, stride, + get_layout_id()); + } + } } }; @@ -495,7 +509,10 @@ struct joint_matrix_mad_impl< get_layout_pair_id(), 0); } } - } else if constexpr (std::is_same::value) { + } else if constexpr (M == 16 && N == 16 && K == 8) { + __mma_tf32_m16n16k8_mma_f32(D.data, A.data, B.data, C.data, + get_layout_pair_id(), 0); + } else if constexpr (std::is_same::value) { __dmma_m8n8k4_mma_f64(D.data, A.data, B.data, C.data, get_layout_pair_id(), 0); } diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp new file mode 100644 index 0000000000000..4ce05be7a8b1a --- /dev/null +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp @@ -0,0 +1,112 @@ +// REQUIRES: cuda + +// RUN: %clangxx -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 -S -Xclang -emit-llvm %s -o -| FileCheck %s + +// IMPORTANT: before updating sm version support beyond sm_86 read the following NOTE! + +// NOTE: Technically the 'wrong' ptx instruction is called by joint_matrix_load/joint_matrix_store in this case: +// notice that the load and store instructions use shape m16n16k16, rather than the correct shape m16n16k8. +// The 'wrong' ptx instruction is used because it returns the correct SASS instructions for all existing supported sm versions: +// sm_80 and sm_86. The Apparent reason for this ptx instruction redundancy is due to the ptx naming convention for the mnk shape triple; +// however we cannot in principle a priori know that future sm versions will behave in the same way and that this redundancy will remain. 
+// This should be validated before supporting any sm versions beyond sm_86. +// The reason that we choose to use the m16n16k16 instruction is that it allows the significant advantage of being able +// to use a portable interface across Intel and Nvidia backends. + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +// M, N, K define the sizes of dimensions of the three matrix types (a, b, +// accumulator) used per subgroup operation. +constexpr int M = 16; // number of rows of accumulator, + // number of cols of b. +constexpr int N = 16; // number of cols of accumulator, + // number of rows of a. +constexpr int K = 8; // number of cols of a/number of rows of b. + +uint32_t A[M * K]; +uint32_t B[K * N]; +float C[M * N]; +float D[M * N]; + +int main() { + + buffer bufA(A, range<1>(M * K)); + buffer bufB(B, range<1>(K * N)); + buffer bufC(C, range<1>(M * N)); + buffer bufD(D, range<1>(M * N)); + + queue q; + + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + auto accD = bufD.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({1, 32}, {1, 32}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix + sub_c; + + joint_matrix + sub_a; + + joint_matrix + sub_b; + + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_, i32 16) #{{.*}} + joint_matrix_load(sg, sub_c, accC.get_pointer(), N); + //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} + joint_matrix_load(sg, sub_a, accA.get_pointer(), K); + //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.row.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 16) #{{.*}} + joint_matrix_load(sg, sub_b, 
accB.get_pointer(), N); + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %10, i32 %11, i32 %12, i32 %13, i32 %15, i32 %16, i32 %17, i32 %18, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8) #{{.*}} + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + //CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1f32(float addrspace(1)* %_arg_14, float %20, float %21, float %22, float %23, float %24, float %25, float %26, float %27, i32 16) #{{.*}} + joint_matrix_store(sg, sub_c, accD.get_pointer(), N); + }); + }); + + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + auto accD = bufD.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({1, 32}, {1, 32}), + [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { + sycl::sub_group sg = item.get_sub_group(); + + joint_matrix + sub_c; + + joint_matrix + sub_a; + + joint_matrix + sub_b; + + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* %_arg_, i32 16) #{{.*}} + joint_matrix_load(sg, sub_c, accC.get_pointer(), N); + //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.col.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} + joint_matrix_load(sg, sub_a, accA.get_pointer(), K); + //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.col.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 16) #{{.*}} + joint_matrix_load(sg, sub_b, accB.get_pointer(), N); + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.col.col.tf32(i32 %10, i32 %11, i32 %12, i32 %13, i32 %15, i32 %16, i32 %17, i32 %18, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8) #{{.*}} + sub_c = 
joint_matrix_mad(sg, sub_a, sub_b, sub_c); + //CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1f32(float addrspace(1)* %_arg_14, float %20, float %21, float %22, float %23, float %24, float %25, float %26, float %27, i32 16) #{{.*}} + joint_matrix_store(sg, sub_c, accD.get_pointer(), N); + }); + }); + + return 0; +}; From 35302b5b1d22b11d96dcd1c0021eae7d99ef4ea0 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Wed, 2 Mar 2022 12:37:42 +0000 Subject: [PATCH 07/26] format --- .../ext/oneapi/matrix/matrix-tensorcore.hpp | 15 +++++++------ .../matrix/matrix-nvptx-fp19-test.cpp | 21 ++++++++++++------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index cce65f54c0999..ba23264b87ba4 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -276,16 +276,15 @@ struct joint_matrix_load_impl< get_layout_id()); } } else if constexpr (std::is_same::value) { - int32_t *tileptr = reinterpret_cast(src.get()); - if constexpr (NumRows == 16 && NumCols == 8) { + int32_t *tileptr = reinterpret_cast(src.get()); + if constexpr (NumRows == 16 && NumCols == 8) { __mma_tf32_m16n16k8_ld_a(res.data, tileptr, stride, get_layout_id()); - } - else if constexpr (NumRows == 8 && NumCols == 16) { + } else if constexpr (NumRows == 8 && NumCols == 16) { __mma_tf32_m16n16k8_ld_b(res.data, tileptr, stride, get_layout_id()); - } - } + } + } } }; @@ -509,10 +508,10 @@ struct joint_matrix_mad_impl< get_layout_pair_id(), 0); } } - } else if constexpr (M == 16 && N == 16 && K == 8) { + } else if constexpr (M == 16 && N == 16 && K == 8) { __mma_tf32_m16n16k8_mma_f32(D.data, A.data, B.data, C.data, get_layout_pair_id(), 0); - } else if constexpr (std::is_same::value) { + } else if constexpr (std::is_same::value) { __dmma_m8n8k4_mma_f64(D.data, A.data, B.data, C.data, 
get_layout_pair_id(), 0); } diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp index 4ce05be7a8b1a..ac17e37601c80 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp @@ -4,14 +4,19 @@ // IMPORTANT: before updating sm version support beyond sm_86 read the following NOTE! -// NOTE: Technically the 'wrong' ptx instruction is called by joint_matrix_load/joint_matrix_store in this case: -// notice that the load and store instructions use shape m16n16k16, rather than the correct shape m16n16k8. -// The 'wrong' ptx instruction is used because it returns the correct SASS instructions for all existing supported sm versions: -// sm_80 and sm_86. The Apparent reason for this ptx instruction redundancy is due to the ptx naming convention for the mnk shape triple; -// however we cannot in principle a priori know that future sm versions will behave in the same way and that this redundancy will remain. -// This should be validated before supporting any sm versions beyond sm_86. -// The reason that we choose to use the m16n16k16 instruction is that it allows the significant advantage of being able -// to use a portable interface across Intel and Nvidia backends. +// NOTE: Technically the 'wrong' ptx instruction is called by +// joint_matrix_load/joint_matrix_store in this case: notice that the load and +// store instructions use shape m16n16k16, rather than the correct shape +// m16n16k8. The 'wrong' ptx instruction is used because it returns the correct +// SASS instructions for all existing supported sm versions: sm_80 and sm_86. 
+// The reason for this ptx instruction redundancy is due to the ptx naming +// convention for the mnk shape triple; however we cannot in principle a priori +// know that future sm versions will behave in the same way and that this +// redundancy will continue as future architecture is released. This should be +// validated before supporting any sm versions beyond sm_86. The reason that we +// choose to use the m16n16k16 instruction is that it allows the significant +// advantage of being able to use a portable interface across Intel and Nvidia +// backends. #include From 712af980e261c42330603ac2ca410ebaa0d19c75 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Wed, 2 Mar 2022 13:15:57 +0000 Subject: [PATCH 08/26] format --- .../include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp | 6 +++--- .../check_device_code/matrix/matrix-nvptx-fp19-test.cpp | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index ba23264b87ba4..012d3eb5006f0 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -81,7 +81,7 @@ __SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2) __SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2) __SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8) - // m16n16k8 fp19 +// m16n16k8 fp19 __SYCL_JOINT_MATRIX_OVERLOAD(uint32_t, a, 16, 8, int32_t, 4) __SYCL_JOINT_MATRIX_OVERLOAD(uint32_t, b, 8, 16, int32_t, 4) @@ -278,10 +278,10 @@ struct joint_matrix_load_impl< } else if constexpr (std::is_same::value) { int32_t *tileptr = reinterpret_cast(src.get()); if constexpr (NumRows == 16 && NumCols == 8) { - __mma_tf32_m16n16k8_ld_a(res.data, tileptr, stride, + __mma_tf32_m16n16k8_ld_a(res.data, tileptr, stride, get_layout_id()); } else if constexpr (NumRows == 8 && NumCols == 16) { - 
__mma_tf32_m16n16k8_ld_b(res.data, tileptr, stride, + __mma_tf32_m16n16k8_ld_b(res.data, tileptr, stride, get_layout_id()); } } diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp index ac17e37601c80..56deb5c52b6fe 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp @@ -2,7 +2,8 @@ // RUN: %clangxx -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 -DSYCL_EXT_ONEAPI_MATRIX=3 -S -Xclang -emit-llvm %s -o -| FileCheck %s -// IMPORTANT: before updating sm version support beyond sm_86 read the following NOTE! +// IMPORTANT: before updating sm version support beyond sm_86 read the following +// NOTE! // NOTE: Technically the 'wrong' ptx instruction is called by // joint_matrix_load/joint_matrix_store in this case: notice that the load and @@ -26,10 +27,10 @@ using namespace sycl::ext::oneapi::experimental::matrix; // M, N, K define the sizes of dimensions of the three matrix types (a, b, // accumulator) used per subgroup operation. constexpr int M = 16; // number of rows of accumulator, - // number of cols of b. + // number of cols of b. constexpr int N = 16; // number of cols of accumulator, - // number of rows of a. -constexpr int K = 8; // number of cols of a/number of rows of b. + // number of rows of a. +constexpr int K = 8; // number of cols of a/number of rows of b. 
uint32_t A[M * K]; uint32_t B[K * N]; From 35306433b318befdb407e846be5a7a8eaac2a17a Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Wed, 2 Mar 2022 13:46:01 +0000 Subject: [PATCH 09/26] format --- sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp index 56deb5c52b6fe..4a3962c542ee5 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp @@ -66,7 +66,7 @@ int main() { joint_matrix sub_b; - + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_, i32 16) #{{.*}} joint_matrix_load(sg, sub_c, accC.get_pointer(), N); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} From fa67ff986453a9959ccc244c72911f5d3179bab4 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Thu, 3 Mar 2022 15:48:22 +0000 Subject: [PATCH 10/26] added comment relating uint32_t to fp19 --- sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp index 4a3962c542ee5..c183ae7ba17aa 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp @@ -32,6 +32,7 @@ constexpr int N = 16; // number of cols of accumulator, // number of rows of a. constexpr int K = 8; // number of cols of a/number of rows of b. 
+// uint32_t is used in this test as the storage type for fp19
 uint32_t A[M * K];
 uint32_t B[K * N];
 float C[M * N];
 float D[M * N];

From 3982001259745c617ad78d57dc67512e8d7ff6e9 Mon Sep 17 00:00:00 2001
From: "jack.kirk" <jack.kirk@codeplay.com>
Date: Fri, 4 Mar 2022 15:41:39 +0000
Subject: [PATCH 11/26] Used neg ptx7.0 builtin for unary minus

---
 clang/include/clang/Basic/BuiltinsNVPTX.def   |  5 +++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td        | 13 +++++++++++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      |  9 +++++++++
 .../sycl/ext/oneapi/experimental/bfloat16.hpp | 13 +++++++++++--
 4 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 449e4d1256944..955dbbaae8f0d 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -182,6 +182,11 @@ BUILTIN(__nvvm_fabs_ftz_f, "ff", "")
 BUILTIN(__nvvm_fabs_f, "ff", "")
 BUILTIN(__nvvm_fabs_d, "dd", "")
 
+// Neg
+
+TARGET_BUILTIN(__nvvm_neg_bf16, "ZUsZUs", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))
+
 // Round
 
 BUILTIN(__nvvm_round_ftz_f, "ff", "")
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 33ba30d782ff3..b7b0813f05292 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -740,6 +740,19 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_fabs_d : GCCBuiltin<"__nvvm_fabs_d">,
       DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>;
 
+//
+// Neg bf16, bf16x2
+//
+
+  foreach unary = ["neg"] in {
+    def int_nvvm_ # unary # _bf16 :
+      GCCBuiltin<!strconcat("__nvvm_", unary, "_bf16")>,
+      DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>;
+    def int_nvvm_ # unary # _bf16x2 :
+      GCCBuiltin<!strconcat("__nvvm_", unary, "_bf16x2")>,
+      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+  }
+
 //
 // Round
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 
ec004c5923ece..af9e1270bc5f5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -719,6 +719,15 @@ def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs, def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_fabs_d>; +// +// Neg bf16, bf16x2 +// + +def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $src0;", Int16Regs, + Int16Regs, int_nvvm_neg_bf16>; +def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", Int32Regs, + Int32Regs, int_nvvm_neg_bf16x2>; + // // Round // diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp index ef1f01d5340ae..3768c65aab6a3 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp @@ -42,7 +42,7 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { static float to_float(const storage_t &a) { #if defined(__SYCL_DEVICE_ONLY__) #if defined(__NVPTX__) - unsigned int y = a; + uint32_t y = a; y = y << 16; float *res = reinterpret_cast(&y); return *res; @@ -81,7 +81,16 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { // Unary minus operator overloading friend bfloat16 operator-(bfloat16 &lhs) { - return bfloat16{-to_float(lhs.value)}; +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + return from_bits(__nvvm_neg_bf16(lhs.value)); +#else + return bfloat16{-__spirv_ConvertBF16ToFINTEL(lhs.value)}; +#endif +#else + throw exception{errc::feature_not_supported, + "Bfloat16 unary minus is not supported on host device"}; +#endif } // Increment and decrement operators overloading From 8d2d11fecfbc1e9caebdc460c7a9cb19b4e18774 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Mon, 7 Mar 2022 16:47:32 +0000 Subject: [PATCH 12/26] Replaced SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc with SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc --- 
.../SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc | 336 ++++++++++++++++++ 1 file changed, 336 insertions(+) create mode 100644 sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc new file mode 100644 index 0000000000000..bf0a799671ffa --- /dev/null +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc @@ -0,0 +1,336 @@ += sycl_oneapi_bf16_conversion + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en + +:blank: pass:[ +] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + +// This is necessary for asciidoc, but not for asciidoctor +:cpp: C++ + +== Notice + +IMPORTANT: This specification is a draft. + +Copyright (c) 2021-2022 Intel Corporation. All rights reserved. + +NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are +trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. +used by permission by Khronos. + +== Dependencies + +This extension is written against the SYCL 2020 specification, Revision 4. + +== Status + +Draft + +This is a preview extension specification, intended to provide early access to +a feature for review and community feedback. When the feature matures, this +specification may be released as a formal extension. + +Because the interfaces defined by this specification are not final and are +subject to change they are not intended to be used by shipping software +products. 
+ +== Version + +Revision: 4 + +== Introduction + +This extension adds functionality to convert a value of single-precision +floating-point type(`float`) to `bfloat16` type and vice versa. The extension +doesn't add support for `bfloat16` type as such, instead it uses 16-bit integer +type(`uint16_t`) as a storage for `bfloat16` values. + +The purpose of conversion from float to bfloat16 is to reduce amount of memory +required to store floating-point numbers. Computations are expected to be done with +32-bit floating-point values. + +This extension is an optional kernel feature as described in +https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:optional-kernel-features[section 5.7] +of the SYCL 2020 spec. Therefore, attempting to submit a kernel using this +feature to a device that does not support it should cause a synchronous +`errc::kernel_not_supported` exception to be thrown from the kernel invocation +command (e.g. from `parallel_for`). + +== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification section 6.3.3 "Feature test macros". Therefore, an implementation +supporting this extension must predefine the macro +`SYCL_EXT_ONEAPI_BF16_CONVERSION` to one of the values defined in the table +below. Applications can test for the existence of this macro to determine if +the implementation supports this feature, or applications can test the macro’s + value to determine which of the extension’s APIs the implementation supports. + +[%header,cols="1,5"] +|=== +|Value |Description +|1 |Initial extension version. Base features are supported. +|=== + +== Extension to `enum class aspect` + +[source] +---- +namespace sycl { +enum class aspect { + ... + ext_oneapi_bf16_conversion +} +} +---- + +If a SYCL device has the `ext_oneapi_bf16_conversion` aspect, then it natively +supports conversion of values of `float` type to `bfloat16` and back. 
+ +If the device doesn't have the aspect, objects of `bfloat16` class must not be +used in the device code. + +**NOTE**: The `ext_oneapi_bf16_conversion` aspect is not yet supported. The +`bfloat16` class is currently supported only on Xe HP GPU and Nvidia A100 GPU. + +== New `bfloat16` class + +The `bfloat16` class below provides the conversion functionality. Conversion +from `float` to `bfloat16` is done with round to nearest even(RTE) rounding +mode. + +[source] +---- +namespace sycl { +namespace ext { +namespace oneapi { +namespace experimental { + +class bfloat16 { + using storage_t = uint16_t; + storage_t value; + +public: + bfloat16() = default; + bfloat16(const bfloat16 &) = default; + ~bfloat16() = default; + + // Explicit conversion functions + static storage_t from_float(const float &a); + static float to_float(const storage_t &a); + + // Convert from float to bfloat16 + bfloat16(const float &a); + bfloat16 &operator=(const float &a); + + // Convert from bfloat16 to float + operator float() const; + + // Get bfloat16 as uint16. + operator storage_t() const; + + // Convert to bool type + explicit operator bool(); + + friend bfloat16 operator-(bfloat16 &bf) { /* ... */ } + + // OP is: prefix ++, -- + friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ } + + // OP is: postfix ++, -- + friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ } + + // OP is: +=, -=, *=, /= + friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ } + + // OP is +, -, *, / + friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ } + template + friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ } + template + friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ } + + // OP is ==,!=, <, >, <=, >= + friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ } + template + friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... 
*/ } + template + friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ } +}; + +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace sycl +---- + +Table 1. Member functions of `bfloat16` class. +|=== +| Member Function | Description + +| `static storage_t from_float(const float &a);` +| Explicitly convert from `float` to `bfloat16`. + +| `static float to_float(const storage_t &a);` +| Interpret `a` as `bfloat16` and explicitly convert it to `float`. + +| `bfloat16(const float& a);` +| Construct `bfloat16` from `float`. Converts `float` to `bfloat16`. + +| `bfloat16 &operator=(const float &a);` +| Replace the value with `a` converted to `bfloat16` + +| `operator float() const;` +| Return `bfloat16` value converted to `float`. + +| `operator storage_t() const;` +| Return `uint16_t` value, whose bits represent `bfloat16` value. + +| `explicit operator bool() { /* ... */ }` +| Convert `bfloat16` to `bool` type. Return `false` if the value equals to + zero, return `true` otherwise. + +| `friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }` +| Construct new instance of `bfloat16` class with negated value of the `bf`. + +| `friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }` +| Perform an in-place `OP` prefix arithmetic operation on the `bf`, + assigning the result to the `bf` and return the `bf`. + + OP is: `++, --` + +| `friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }` +| Perform an in-place `OP` postfix arithmetic operation on `bf`, assigning + the result to the `bf` and return a copy of `bf` before the operation is + performed. + + OP is: `++, --` + +| `friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }` +| Perform an in-place `OP` arithmetic operation between the `lhs` and the `rhs` + and return the `lhs`. + + OP is: `+=, -=, *=, /=` + +| `friend type operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... 
*/ }` +| Construct a new instance of the `bfloat16` class with the value of the new + `bfloat16` instance being the result of an OP arithmetic operation between + the `lhs` `bfloat16` and `rhs` `bfloat16` values. + + OP is `+, -, *, /` + +| `template + friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }` +| Construct a new instance of the `bfloat16` class with the value of the new + `bfloat16` instance being the result of an OP arithmetic operation between + the `lhs` `bfloat16` value and `rhs` of template type `T`. Type `T` must be + convertible to `float`. + + OP is `+, -, *, /` + +| `template + friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }` +| Construct a new instance of the `bfloat16` class with the value of the new + `bfloat16` instance being the result of an OP arithmetic operation between + the `lhs` of template type `T` and `rhs` `bfloat16` value. Type `T` must be + convertible to `float`. + + OP is `+, -, *, /` + +| `friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }` +| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` `bfloat16` + values and return the result as a boolean value. + +OP is `==, !=, <, >, <=, >=` + +| `template + friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }` +| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` of + template type `T` and return the result as a boolean value. Type `T` must be + convertible to `float`. + +OP is `==, !=, <, >, <=, >=` + +| `template + friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }` +| Perform comparison operation OP between `lhs` of template type `T` and `rhs` + `bfloat16` value and return the result as a boolean value. Type `T` must be + convertible to `float`. 
+ +OP is `==, !=, <, >, <=, >=` +|=== + +== Example + +[source] +---- +#include +#include + +using sycl::ext::oneapi::experimental::bfloat16; + +bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) { + return static_cast(lhs) + static_cast(rhs); +} + +float foo(float a, float b) { + // Convert from float to bfloat16. + bfloat16 A {a}; + bfloat16 B {b}; + + // Convert A and B from bfloat16 to float, do addition on floating-point + // numbers, then convert the result to bfloat16 and store it in C. + bfloat16 C = A + B; + + // Return the result converted from bfloat16 to float. + return C; +} + +int main (int argc, char *argv[]) { + float data[3] = {7.0, 8.1, 0.0}; + sycl::device dev; + sycl::queue deviceQueue{dev}; + sycl::buffer buf {data, sycl::range<1> {3}}; + + if (dev.has(sycl::aspect::ext_oneapi_bf16_conversion)) { + deviceQueue.submit ([&] (sycl::handler& cgh) { + auto numbers = buf.get_access (cgh); + cgh.single_task ([=] () { + numbers[2] = foo(numbers[0], numbers[1]); + }); + }); + } + return 0; +} +---- + +== Issues + +None. 
+ +== Revision History + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|======================================== +|Rev|Date|Author|Changes +|1|2021-08-02|Alexey Sotkin |Initial public working draft +|2|2021-08-17|Alexey Sotkin |Add explicit conversion functions + + Add operator overloadings + + Apply code review suggestions +|3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor +|4|2022-03-07|Jack Kirk |Switch from Intel vendor specific to oneapi +|======================================== From bfc68d22e3ef8166aa1d0476a25baf9ce6f59c15 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Thu, 10 Mar 2022 21:34:23 +0000 Subject: [PATCH 13/26] fp19 comments ->tf32 --- sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp | 2 +- .../{matrix-nvptx-fp19-test.cpp => matrix-nvptx-tf32-test.cpp} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename sycl/test/check_device_code/matrix/{matrix-nvptx-fp19-test.cpp => matrix-nvptx-tf32-test.cpp} (99%) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index 012d3eb5006f0..df3b6e97d258e 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -81,7 +81,7 @@ __SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2) __SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2) __SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8) -// m16n16k8 fp19 +// m16n16k8 tf32 __SYCL_JOINT_MATRIX_OVERLOAD(uint32_t, a, 16, 8, int32_t, 4) __SYCL_JOINT_MATRIX_OVERLOAD(uint32_t, b, 8, 16, int32_t, 4) diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp similarity index 99% rename from sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp rename to sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp index c183ae7ba17aa..709005a45b160 100644 --- 
a/sycl/test/check_device_code/matrix/matrix-nvptx-fp19-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp @@ -32,7 +32,7 @@ constexpr int N = 16; // number of cols of accumulator, // number of rows of a. constexpr int K = 8; // number of cols of a/number of rows of b. -// uint32_t is used in this test as the storage type for fp19 +// uint32_t is used in this test as the storage type for tf32 uint32_t A[M * K]; uint32_t B[K * N]; float C[M * N]; From 8a29c4412c06b1246bdcd0fa5954b70957211e36 Mon Sep 17 00:00:00 2001 From: "jack.kirk" Date: Tue, 15 Mar 2022 14:41:54 +0000 Subject: [PATCH 14/26] Renamed extension to cover all bfloat16 funct. Removed aspect reference: can be added once the ext_oneapi_bfloat16 aspect is merged. --- ....asciidoc => sycl_ext_oneapi_bfloat16.asciidoc} | 14 +++++++------- .../sycl/ext/oneapi/experimental/bfloat16.hpp | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) rename sycl/doc/extensions/experimental/{SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc => sycl_ext_oneapi_bfloat16.asciidoc} (96%) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc similarity index 96% rename from sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc index bf0a799671ffa..175219e23c47f 100644 --- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_BF16_CONVERSION.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc @@ -1,4 +1,4 @@ -= sycl_oneapi_bf16_conversion += sycl_ext_oneapi_bfloat16 :source-highlighter: coderay :coderay-linenums-mode: table @@ -73,7 +73,7 @@ command (e.g. from `parallel_for`). This extension provides a feature-test macro as described in the core SYCL specification section 6.3.3 "Feature test macros". 
Therefore, an implementation supporting this extension must predefine the macro -`SYCL_EXT_ONEAPI_BF16_CONVERSION` to one of the values defined in the table +`SYCL_EXT_ONEAPI_BFLOAT16` to one of the values defined in the table below. Applications can test for the existence of this macro to determine if the implementation supports this feature, or applications can test the macro’s value to determine which of the extension’s APIs the implementation supports. @@ -91,18 +91,18 @@ the implementation supports this feature, or applications can test the macro’s namespace sycl { enum class aspect { ... - ext_oneapi_bf16_conversion + ext_oneapi_bfloat16 } } ---- -If a SYCL device has the `ext_oneapi_bf16_conversion` aspect, then it natively +If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively supports conversion of values of `float` type to `bfloat16` and back. If the device doesn't have the aspect, objects of `bfloat16` class must not be used in the device code. -**NOTE**: The `ext_oneapi_bf16_conversion` aspect is not yet supported. The +**NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported. The `bfloat16` class is currently supported only on Xe HP GPU and Nvidia A100 GPU. == New `bfloat16` class @@ -304,7 +304,7 @@ int main (int argc, char *argv[]) { sycl::queue deviceQueue{dev}; sycl::buffer buf {data, sycl::range<1> {3}}; - if (dev.has(sycl::aspect::ext_oneapi_bf16_conversion)) { + if (dev.has(sycl::aspect::ext_oneapi_bfloat16)) { deviceQueue.submit ([&] (sycl::handler& cgh) { auto numbers = buf.get_access (cgh); cgh.single_task ([=] () { @@ -332,5 +332,5 @@ None. 
Add operator overloadings + Apply code review suggestions |3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor -|4|2022-03-07|Jack Kirk |Switch from Intel vendor specific to oneapi +|4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi |======================================== diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp index 3768c65aab6a3..1190c80631928 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp @@ -17,7 +17,7 @@ namespace ext { namespace oneapi { namespace experimental { -class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { +class bfloat16 { using storage_t = uint16_t; storage_t value; From 52c8e20890e1e88dd1a4cff7812b3c75fd9dc8a6 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 17 Mar 2022 17:34:07 +0000 Subject: [PATCH 15/26] Changing impl to accept float with boolean switch to tell whether tf32 are being used --- .../ext/oneapi/matrix/matrix-tensorcore.hpp | 187 ++++++++++-------- 1 file changed, 100 insertions(+), 87 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index df3b6e97d258e..b3c794ded7665 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -21,69 +21,71 @@ enum class matrix_layout { row_major, col_major, packed_a, packed_b }; template + typename Group = sycl::sub_group, bool use_tf32 = false, + typename Cond = void> struct joint_matrix; -#define __SYCL_JOINT_MATRIX_OVERLOAD(type, use, M, N, frag_type, frag_size) \ +#define __SYCL_JOINT_MATRIX_OVERLOAD(type, use, M, N, frag_type, frag_size, \ + use_tf32) \ template \ struct joint_matrix< \ - type, matrix_use::use, M, N, Layout, sycl::sub_group, \ + type, matrix_use::use, M, N, Layout, 
sycl::sub_group, use_tf32, \ typename std::enable_if_t> { \ frag_type data[frag_size]; \ }; // m8n8k4 double only -__SYCL_JOINT_MATRIX_OVERLOAD(double, a, 8, 4, double, 1) -__SYCL_JOINT_MATRIX_OVERLOAD(double, b, 4, 8, double, 1) -__SYCL_JOINT_MATRIX_OVERLOAD(double, accumulator, 8, 8, double, 2) +__SYCL_JOINT_MATRIX_OVERLOAD(double, a, 8, 4, double, 1, false) +__SYCL_JOINT_MATRIX_OVERLOAD(double, b, 4, 8, double, 1, false) +__SYCL_JOINT_MATRIX_OVERLOAD(double, accumulator, 8, 8, double, 2, false) // m8n32k16 // bf16 data format uses uint16_t data type -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 8, 16, int32_t, 2) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 32, int32_t, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 8, 16, int32_t, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 32, int32_t, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 8, 32, float, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 8, 32, int32_t, 4) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 8, 16, int32_t, 1) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 32, int32_t, 4) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 8, 16, int32_t, 1) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 32, int32_t, 4) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 8, 32, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 8, 16, int32_t, 2, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 32, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 8, 16, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 32, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 8, 32, float, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 8, 32, int32_t, 4, false) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 8, 16, int32_t, 1, false) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 32, int32_t, 4, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 8, 16, int32_t, 1, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 32, int32_t, 4, false) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 
8, 32, int32_t, 8, false) // m32n8k16 -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 32, 16, int32_t, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 8, int32_t, 2) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 32, 16, int32_t, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 8, int32_t, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 32, 8, float, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 32, 8, int32_t, 4) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 32, 16, int32_t, 4) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 8, int32_t, 1) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 32, 16, int32_t, 4) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 8, int32_t, 1) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 32, 8, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 32, 16, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 8, int32_t, 2, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 32, 16, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 8, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 32, 8, float, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 32, 8, int32_t, 4, false) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 32, 16, int32_t, 4, false) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 8, int32_t, 1, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 32, 16, int32_t, 4, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 8, int32_t, 1, false) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 32, 8, int32_t, 8, false) // m16n16k16 -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 16, 16, int32_t, 4) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 16, int32_t, 4) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 16, 16, int32_t, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 16, int32_t, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 16, 16, float, 8) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 16, 16, int32_t, 4) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 16, 16, int32_t, 2) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, 
b, 16, 16, int32_t, 2) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 16, 16, int32_t, 4, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 16, int32_t, 4, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 16, 16, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 16, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 16, 16, float, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 16, 16, int32_t, 4, false) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 16, 16, int32_t, 2, false) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 16, int32_t, 2, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2, false) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8, false) // m16n16k8 tf32 -__SYCL_JOINT_MATRIX_OVERLOAD(uint32_t, a, 16, 8, int32_t, 4) -__SYCL_JOINT_MATRIX_OVERLOAD(uint32_t, b, 8, 16, int32_t, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(float, a, 16, 8, float, 4, true) +__SYCL_JOINT_MATRIX_OVERLOAD(float, b, 8, 16, float, 4, true) #undef __SYCL_JOINT_MATRIX_OVERLOAD } // namespace experimental::matrix @@ -93,11 +95,13 @@ namespace detail { template + access::address_space Space, bool use_tf32 = false, + typename Cond = void> struct joint_matrix_load_impl { - void load(sycl::ext::oneapi::experimental::matrix::joint_matrix< - T, Use, NumRows, NumCols, Layout, sycl::sub_group> &res, - multi_ptr src, size_t stride); + void + load(sycl::ext::oneapi::experimental::matrix::joint_matrix< + T, Use, NumRows, NumCols, Layout, sycl::sub_group, use_tf32> &res, + multi_ptr src, size_t stride); }; template @@ -118,16 +122,17 @@ constexpr int get_layout_id< template + access::address_space Space, bool use_tf32> struct joint_matrix_load_impl< - T, Use, NumRows, 
NumCols, Layout, Space, + T, Use, NumRows, NumCols, Layout, Space, use_tf32, typename std::enable_if_t> { - void load(sycl::ext::oneapi::experimental::matrix::joint_matrix< - T, Use, NumRows, NumCols, Layout, sycl::sub_group> &res, - multi_ptr src, size_t stride) { + void + load(sycl::ext::oneapi::experimental::matrix::joint_matrix< + T, Use, NumRows, NumCols, Layout, sycl::sub_group, use_tf32> &res, + multi_ptr src, size_t stride) { if constexpr (std::is_same::value) { int32_t *tileptr = reinterpret_cast(src.get()); if constexpr (NumRows == 16 && NumCols == 16) { @@ -251,15 +256,28 @@ struct joint_matrix_load_impl< get_layout_id()); } } else if constexpr (std::is_same::value) { - if constexpr (NumRows == 16 && NumCols == 16) { - __hmma_m16n16k16_ld_c_f32(res.data, src.get(), stride, - get_layout_id()); - } else if constexpr (NumRows == 8 && NumCols == 32) { - __hmma_m8n32k16_ld_c_f32(res.data, src.get(), stride, - get_layout_id()); - } else if constexpr (NumRows == 32 && NumCols == 8) { - __hmma_m32n8k16_ld_c_f32(res.data, src.get(), stride, - get_layout_id()); + if constexpr (use_tf32) { + // TODO make sure I am casting from float to tf32 correctly + int32_t *tileptr = reinterpret_cast(src.get()); + if constexpr (NumRows == 16 && NumCols == 8) { + __mma_tf32_m16n16k8_ld_a(reinterpret_cast(res.data), + tileptr, stride, get_layout_id()); + } else if constexpr (NumRows == 8 && NumCols == 16) { + __mma_tf32_m16n16k8_ld_b(reinterpret_cast(res.data), + tileptr, stride, get_layout_id()); + } + // END TODO + } else { + if constexpr (NumRows == 16 && NumCols == 16) { + __hmma_m16n16k16_ld_c_f32(res.data, src.get(), stride, + get_layout_id()); + } else if constexpr (NumRows == 8 && NumCols == 32) { + __hmma_m8n32k16_ld_c_f32(res.data, src.get(), stride, + get_layout_id()); + } else if constexpr (NumRows == 32 && NumCols == 8) { + __hmma_m32n8k16_ld_c_f32(res.data, src.get(), stride, + get_layout_id()); + } } } else if constexpr (std::is_same::value) { if constexpr (Use == 
@@ -275,15 +293,6 @@ struct joint_matrix_load_impl< __dmma_m8n8k4_ld_c(res.data, src.get(), stride, get_layout_id()); } - } else if constexpr (std::is_same::value) { - int32_t *tileptr = reinterpret_cast(src.get()); - if constexpr (NumRows == 16 && NumCols == 8) { - __mma_tf32_m16n16k8_ld_a(res.data, tileptr, stride, - get_layout_id()); - } else if constexpr (NumRows == 8 && NumCols == 16) { - __mma_tf32_m16n16k8_ld_b(res.data, tileptr, stride, - get_layout_id()); - } } } }; @@ -360,18 +369,18 @@ template + bool use_tf32, typename Cond = void> struct joint_matrix_mad_impl { sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, M, N, LayoutC, sycl::sub_group> mad(sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::a, M, K, - LayoutA, sycl::sub_group> + LayoutA, sycl::sub_group, use_tf32> A, sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::b, K, N, - LayoutB, sycl::sub_group> + LayoutB, sycl::sub_group, use_tf32> B, sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, @@ -414,9 +423,10 @@ constexpr int get_layout_pair_id< template + sycl::ext::oneapi::experimental::matrix::matrix_layout LayoutC, + bool use_tf32> struct joint_matrix_mad_impl< - T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, + T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, use_tf32, typename std::enable_if_t< (LayoutA == sycl::ext::oneapi::experimental::matrix::matrix_layout:: row_major || @@ -435,11 +445,11 @@ struct joint_matrix_mad_impl< N, LayoutC, sycl::sub_group> mad(sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::a, M, K, - LayoutA, sycl::sub_group> + LayoutA, sycl::sub_group, use_tf32> A, sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, 
sycl::ext::oneapi::experimental::matrix::matrix_use::b, K, N, - LayoutB, sycl::sub_group> + LayoutB, sycl::sub_group, use_tf32> B, sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, @@ -508,8 +518,9 @@ struct joint_matrix_mad_impl< get_layout_pair_id(), 0); } } - } else if constexpr (M == 16 && N == 16 && K == 8) { - __mma_tf32_m16n16k8_mma_f32(D.data, A.data, B.data, C.data, + } else if constexpr (M == 16 && N == 16 && K == 8 && use_tf32) { + __mma_tf32_m16n16k8_mma_f32(D.data, reinterpret_cast(A.data), + reinterpret_cast(B.data), C.data, get_layout_pair_id(), 0); } else if constexpr (std::is_same::value) { __dmma_m8n8k4_mma_f64(D.data, A.data, B.data, C.data, @@ -524,13 +535,15 @@ struct joint_matrix_mad_impl< namespace experimental::matrix { template + size_t NumCols, matrix_layout Layout, access::address_space Space, + bool use_tf32 = false> void joint_matrix_load( - Group sg, joint_matrix &res, + Group sg, + joint_matrix &res, multi_ptr src, size_t stride) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) sycl::ext::oneapi::detail::joint_matrix_load_impl{} + Layout, Space, use_tf32>{} .load(res, src, stride); #else (void)sg; @@ -568,15 +581,15 @@ void joint_matrix_store(Group sg, template + matrix_layout LayoutB, matrix_layout LayoutC, bool use_tf32> joint_matrix joint_matrix_mad( - Group sg, joint_matrix A, - joint_matrix B, + Group sg, joint_matrix A, + joint_matrix B, joint_matrix C) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) return sycl::ext::oneapi::detail::joint_matrix_mad_impl< - T1, T2, M, K, N, LayoutA, LayoutB, LayoutC>{} + T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, use_tf32>{} .mad(A, B, C); #else (void)sg; From 813aa4bff77a5290f6b6913c7d56ae4224fd4041 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 23 Mar 2022 10:44:29 +0000 Subject: [PATCH 16/26] Final impl --- .../ext/oneapi/matrix/matrix-tensorcore.hpp | 171 ++++++++++-------- 1 file changed, 
91 insertions(+), 80 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index b3c794ded7665..465331f6edd56 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -18,74 +18,80 @@ enum class matrix_use { a, b, accumulator }; enum class matrix_layout { row_major, col_major, packed_a, packed_b }; +enum class use_tf32 { yes, no }; + template struct joint_matrix; #define __SYCL_JOINT_MATRIX_OVERLOAD(type, use, M, N, frag_type, frag_size, \ - use_tf32) \ + Tf32) \ template \ struct joint_matrix< \ - type, matrix_use::use, M, N, Layout, sycl::sub_group, use_tf32, \ + type, matrix_use::use, M, N, Layout, sycl::sub_group, Tf32, \ typename std::enable_if_t> { \ frag_type data[frag_size]; \ }; // m8n8k4 double only -__SYCL_JOINT_MATRIX_OVERLOAD(double, a, 8, 4, double, 1, false) -__SYCL_JOINT_MATRIX_OVERLOAD(double, b, 4, 8, double, 1, false) -__SYCL_JOINT_MATRIX_OVERLOAD(double, accumulator, 8, 8, double, 2, false) +__SYCL_JOINT_MATRIX_OVERLOAD(double, a, 8, 4, double, 1, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(double, b, 4, 8, double, 1, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(double, accumulator, 8, 8, double, 2, use_tf32::no) // m8n32k16 // bf16 data format uses uint16_t data type -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 8, 16, int32_t, 2, false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 32, int32_t, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 8, 16, int32_t, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 32, int32_t, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 8, 32, float, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 8, 32, int32_t, 4, false) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 8, 16, int32_t, 1, false) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 32, int32_t, 4, false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 8, 16, int32_t, 1, 
false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 32, int32_t, 4, false) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 8, 32, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 8, 16, int32_t, 2, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 32, int32_t, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 8, 16, int32_t, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 32, int32_t, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 8, 32, float, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 8, 32, int32_t, 4, use_tf32::no) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 8, 16, int32_t, 1, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 32, int32_t, 4, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 8, 16, int32_t, 1, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 32, int32_t, 4, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 8, 32, int32_t, 8, + use_tf32::no) // m32n8k16 -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 32, 16, int32_t, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 8, int32_t, 2, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 32, 16, int32_t, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 8, int32_t, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 32, 8, float, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 32, 8, int32_t, 4, false) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 32, 16, int32_t, 4, false) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 8, int32_t, 1, false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 32, 16, int32_t, 4, false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 8, int32_t, 1, false) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 32, 8, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 32, 16, int32_t, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 8, int32_t, 2, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 32, 16, int32_t, 8, 
use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 8, int32_t, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 32, 8, float, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 32, 8, int32_t, 4, use_tf32::no) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 32, 16, int32_t, 4, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 8, int32_t, 1, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 32, 16, int32_t, 4, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 8, int32_t, 1, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 32, 8, int32_t, 8, + use_tf32::no) // m16n16k16 -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 16, 16, int32_t, 4, false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 16, int32_t, 4, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 16, 16, int32_t, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 16, int32_t, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 16, 16, float, 8, false) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 16, 16, int32_t, 4, false) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 16, 16, int32_t, 2, false) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 16, int32_t, 2, false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2, false) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2, false) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8, false) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 16, 16, int32_t, 4, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 16, int32_t, 4, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 16, 16, int32_t, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 16, int32_t, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 16, 16, float, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 16, 16, int32_t, 4, + use_tf32::no) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 16, 16, int32_t, 2, use_tf32::no) 
+__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 16, int32_t, 2, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8, + use_tf32::no) // m16n16k8 tf32 -__SYCL_JOINT_MATRIX_OVERLOAD(float, a, 16, 8, float, 4, true) -__SYCL_JOINT_MATRIX_OVERLOAD(float, b, 8, 16, float, 4, true) +__SYCL_JOINT_MATRIX_OVERLOAD(float, a, 16, 8, int32_t, 4, use_tf32::yes) +__SYCL_JOINT_MATRIX_OVERLOAD(float, b, 8, 16, int32_t, 4, use_tf32::yes) #undef __SYCL_JOINT_MATRIX_OVERLOAD } // namespace experimental::matrix @@ -95,13 +101,14 @@ namespace detail { template struct joint_matrix_load_impl { - void - load(sycl::ext::oneapi::experimental::matrix::joint_matrix< - T, Use, NumRows, NumCols, Layout, sycl::sub_group, use_tf32> &res, - multi_ptr src, size_t stride); + void load(sycl::ext::oneapi::experimental::matrix::joint_matrix< + T, Use, NumRows, NumCols, Layout, sycl::sub_group, Tf32> &res, + multi_ptr src, size_t stride); }; template @@ -122,17 +129,17 @@ constexpr int get_layout_id< template + access::address_space Space, + sycl::ext::oneapi::experimental::matrix::use_tf32 Tf32> struct joint_matrix_load_impl< - T, Use, NumRows, NumCols, Layout, Space, use_tf32, + T, Use, NumRows, NumCols, Layout, Space, Tf32, typename std::enable_if_t> { - void - load(sycl::ext::oneapi::experimental::matrix::joint_matrix< - T, Use, NumRows, NumCols, Layout, sycl::sub_group, use_tf32> &res, - multi_ptr src, size_t stride) { + void load(sycl::ext::oneapi::experimental::matrix::joint_matrix< + T, Use, NumRows, NumCols, Layout, sycl::sub_group, Tf32> &res, + multi_ptr src, size_t stride) { if constexpr (std::is_same::value) { int32_t *tileptr = reinterpret_cast(src.get()); if constexpr (NumRows == 16 && NumCols == 16) { @@ -256,17 +263,20 @@ struct joint_matrix_load_impl< get_layout_id()); } } else if constexpr 
(std::is_same::value) { - if constexpr (use_tf32) { - // TODO make sure I am casting from float to tf32 correctly + if constexpr (Tf32 == + sycl::ext::oneapi::experimental::matrix::use_tf32::yes) { int32_t *tileptr = reinterpret_cast(src.get()); if constexpr (NumRows == 16 && NumCols == 8) { - __mma_tf32_m16n16k8_ld_a(reinterpret_cast(res.data), - tileptr, stride, get_layout_id()); + __mma_tf32_m16n16k8_ld_a(res.data, tileptr, stride, + get_layout_id()); } else if constexpr (NumRows == 8 && NumCols == 16) { - __mma_tf32_m16n16k8_ld_b(reinterpret_cast(res.data), - tileptr, stride, get_layout_id()); + __mma_tf32_m16n16k8_ld_b(res.data, tileptr, stride, + get_layout_id()); + } + for (int i = 0; i < 4; ++i) { + auto tmpfloat = __nvvm_bitcast_i2f(res.data[i]); + res.data[i] = __nvvm_f2tf32_rna(tmpfloat); } - // END TODO } else { if constexpr (NumRows == 16 && NumCols == 16) { __hmma_m16n16k16_ld_c_f32(res.data, src.get(), stride, @@ -369,18 +379,19 @@ template + sycl::ext::oneapi::experimental::matrix::use_tf32 Tf32, + typename Cond = void> struct joint_matrix_mad_impl { sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, M, N, LayoutC, sycl::sub_group> mad(sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::a, M, K, - LayoutA, sycl::sub_group, use_tf32> + LayoutA, sycl::sub_group, Tf32> A, sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::b, K, N, - LayoutB, sycl::sub_group, use_tf32> + LayoutB, sycl::sub_group, Tf32> B, sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, @@ -424,9 +435,9 @@ template + sycl::ext::oneapi::experimental::matrix::use_tf32 Tf32> struct joint_matrix_mad_impl< - T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, use_tf32, + T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, Tf32, typename 
std::enable_if_t< (LayoutA == sycl::ext::oneapi::experimental::matrix::matrix_layout:: row_major || @@ -445,11 +456,11 @@ struct joint_matrix_mad_impl< N, LayoutC, sycl::sub_group> mad(sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::a, M, K, - LayoutA, sycl::sub_group, use_tf32> + LayoutA, sycl::sub_group, Tf32> A, sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::b, K, N, - LayoutB, sycl::sub_group, use_tf32> + LayoutB, sycl::sub_group, Tf32> B, sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, @@ -518,9 +529,10 @@ struct joint_matrix_mad_impl< get_layout_pair_id(), 0); } } - } else if constexpr (M == 16 && N == 16 && K == 8 && use_tf32) { - __mma_tf32_m16n16k8_mma_f32(D.data, reinterpret_cast(A.data), - reinterpret_cast(B.data), C.data, + } else if constexpr (M == 16 && N == 16 && K == 8 && + Tf32 == sycl::ext::oneapi::experimental::matrix:: + use_tf32::yes) { + __mma_tf32_m16n16k8_mma_f32(D.data, A.data, B.data, C.data, get_layout_pair_id(), 0); } else if constexpr (std::is_same::value) { __dmma_m8n8k4_mma_f64(D.data, A.data, B.data, C.data, @@ -536,14 +548,13 @@ namespace experimental::matrix { template + use_tf32 Tf32 = use_tf32::no> void joint_matrix_load( - Group sg, - joint_matrix &res, + Group sg, joint_matrix &res, multi_ptr src, size_t stride) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) sycl::ext::oneapi::detail::joint_matrix_load_impl{} + Layout, Space, Tf32>{} .load(res, src, stride); #else (void)sg; @@ -581,15 +592,15 @@ void joint_matrix_store(Group sg, template + matrix_layout LayoutB, matrix_layout LayoutC, use_tf32 Tf32> joint_matrix joint_matrix_mad( - Group sg, joint_matrix A, - joint_matrix B, + Group sg, joint_matrix A, + joint_matrix B, joint_matrix C) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) return 
sycl::ext::oneapi::detail::joint_matrix_mad_impl< - T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, use_tf32>{} + T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, Tf32>{} .mad(A, B, C); #else (void)sg; From 0630667c328927017ecf3157241b91d7b2d95c8e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 23 Mar 2022 11:24:22 +0000 Subject: [PATCH 17/26] Adding sycl test --- .../matrix/matrix-nvptx-tf32-test.cpp | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp index 709005a45b160..960122a0f54e5 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp @@ -32,25 +32,25 @@ constexpr int N = 16; // number of cols of accumulator, // number of rows of a. constexpr int K = 8; // number of cols of a/number of rows of b. -// uint32_t is used in this test as the storage type for tf32 -uint32_t A[M * K]; -uint32_t B[K * N]; +// float is used in this test as the storage type for tf32 +float A[M * K]; +float B[K * N]; float C[M * N]; float D[M * N]; int main() { - buffer bufA(A, range<1>(M * K)); - buffer bufB(B, range<1>(K * N)); + buffer bufA(A, range<1>(M * K)); // will be used as tf32 + buffer bufB(B, range<1>(K * N)); // will be used as tf32 buffer bufC(C, range<1>(M * N)); buffer bufD(D, range<1>(M * N)); queue q; q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); + auto accC = bufC.get_access(cgh); auto accD = bufD.get_access(cgh); cgh.parallel_for( @@ -58,16 +58,18 @@ int main() { [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { sycl::sub_group sg = item.get_sub_group(); - joint_matrix - sub_c; - - joint_matrix + joint_matrix sub_a; - joint_matrix + joint_matrix sub_b; + joint_matrix + sub_c; + //CHECK: tail call { float, float, float, 
float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_, i32 16) #{{.*}} joint_matrix_load(sg, sub_c, accC.get_pointer(), N); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} @@ -82,9 +84,9 @@ int main() { }); q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); + auto accC = bufC.get_access(cgh); auto accD = bufD.get_access(cgh); cgh.parallel_for( @@ -92,16 +94,18 @@ int main() { [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { sycl::sub_group sg = item.get_sub_group(); - joint_matrix - sub_c; - - joint_matrix + joint_matrix sub_a; - joint_matrix + joint_matrix sub_b; + joint_matrix + sub_c; + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* %_arg_, i32 16) #{{.*}} joint_matrix_load(sg, sub_c, accC.get_pointer(), N); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.col.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} From 61b3d8f7ab9ece2e9b5c2998ff9e34f27cd20105 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 23 Mar 2022 12:29:52 +0000 Subject: [PATCH 18/26] Device code check passing --- .../matrix/matrix-nvptx-tf32-test.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp index 960122a0f54e5..d2866d5baad69 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp @@ -70,15 +70,15 @@ int main() { matrix_layout::row_major> sub_c; - //CHECK: tail call { float, float, float, float, float, float, float, float } 
@llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_, i32 16) #{{.*}} + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_accC, i32 16) #{{.*}} joint_matrix_load(sg, sub_c, accC.get_pointer(), N); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} joint_matrix_load(sg, sub_a, accA.get_pointer(), K); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.row.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 16) #{{.*}} joint_matrix_load(sg, sub_b, accB.get_pointer(), N); - //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 %10, i32 %11, i32 %12, i32 %13, i32 %15, i32 %16, i32 %17, i32 %18, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8) #{{.*}} + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}} sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - //CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1f32(float addrspace(1)* %_arg_14, float %20, float %21, float %22, float %23, float %24, float %25, float %26, float %27, i32 16) #{{.*}} + //CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1f32(float addrspace(1)* {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 {{.*}} joint_matrix_store(sg, sub_c, accD.get_pointer(), N); }); }); @@ -106,15 +106,15 @@ int main() { matrix_layout::col_major> sub_c; - //CHECK: tail call { 
float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* %_arg_, i32 16) #{{.*}} + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, i32 {{.*}}) #{{.*}} joint_matrix_load(sg, sub_c, accC.get_pointer(), N); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.col.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} joint_matrix_load(sg, sub_a, accA.get_pointer(), K); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.col.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 16) #{{.*}} joint_matrix_load(sg, sub_b, accB.get_pointer(), N); - //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.col.col.tf32(i32 %10, i32 %11, i32 %12, i32 %13, i32 %15, i32 %16, i32 %17, i32 %18, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8) #{{.*}} + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.col.col.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}} sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - //CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1f32(float addrspace(1)* %_arg_14, float %20, float %21, float %22, float %23, float %24, float %25, float %26, float %27, i32 16) #{{.*}} + //CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) #{{.*}} joint_matrix_store(sg, sub_c, accD.get_pointer(), N); }); }); From 
23cb7daefe053300b227cdbb2ec20b2eeaa7adc1 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 24 Mar 2022 16:05:02 +0000 Subject: [PATCH 19/26] Changing to precision enum --- .../ext/oneapi/matrix/matrix-tensorcore.hpp | 156 ++++++++++-------- .../matrix/matrix-nvptx-tf32-test.cpp | 8 +- 2 files changed, 90 insertions(+), 74 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index 465331f6edd56..22dcbe85120e8 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -18,80 +18,96 @@ enum class matrix_use { a, b, accumulator }; enum class matrix_layout { row_major, col_major, packed_a, packed_b }; -enum class use_tf32 { yes, no }; +enum class precision { standard, tf32 /* TODO add more precisions*/ }; template + typename Group = sycl::sub_group, + precision Prec = precision::standard, typename Cond = void> struct joint_matrix; #define __SYCL_JOINT_MATRIX_OVERLOAD(type, use, M, N, frag_type, frag_size, \ - Tf32) \ + Prec) \ template \ struct joint_matrix< \ - type, matrix_use::use, M, N, Layout, sycl::sub_group, Tf32, \ + type, matrix_use::use, M, N, Layout, sycl::sub_group, Prec, \ typename std::enable_if_t> { \ frag_type data[frag_size]; \ }; // m8n8k4 double only -__SYCL_JOINT_MATRIX_OVERLOAD(double, a, 8, 4, double, 1, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(double, b, 4, 8, double, 1, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(double, accumulator, 8, 8, double, 2, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(double, a, 8, 4, double, 1, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(double, b, 4, 8, double, 1, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(double, accumulator, 8, 8, double, 2, + precision::standard) // m8n32k16 // bf16 data format uses uint16_t data type -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 8, 16, int32_t, 2, use_tf32::no) 
-__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 32, int32_t, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 8, 16, int32_t, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 32, int32_t, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 8, 32, float, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 8, 32, int32_t, 4, use_tf32::no) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 8, 16, int32_t, 1, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 32, int32_t, 4, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 8, 16, int32_t, 1, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 32, int32_t, 4, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 8, 16, int32_t, 2, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 32, int32_t, 8, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 8, 16, int32_t, 8, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 32, int32_t, 8, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 8, 32, float, 8, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 8, 32, int32_t, 4, + precision::standard) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 8, 16, int32_t, 1, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 32, int32_t, 4, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 8, 16, int32_t, 1, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 32, int32_t, 4, + precision::standard) __SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 8, 32, int32_t, 8, - use_tf32::no) + precision::standard) // m32n8k16 -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 32, 16, int32_t, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 8, int32_t, 2, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 32, 16, int32_t, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 8, int32_t, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 
32, 8, float, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 32, 8, int32_t, 4, use_tf32::no) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 32, 16, int32_t, 4, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 8, int32_t, 1, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 32, 16, int32_t, 4, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 8, int32_t, 1, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 32, 16, int32_t, 8, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 8, int32_t, 2, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 32, 16, int32_t, 8, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 8, int32_t, 8, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 32, 8, float, 8, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 32, 8, int32_t, 4, + precision::standard) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 32, 16, int32_t, 4, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 8, int32_t, 1, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 32, 16, int32_t, 4, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 8, int32_t, 1, precision::standard) __SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 32, 8, int32_t, 8, - use_tf32::no) + precision::standard) // m16n16k16 -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 16, 16, int32_t, 4, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 16, int32_t, 4, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 16, 16, int32_t, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 16, int32_t, 8, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 16, 16, float, 8, use_tf32::no) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 16, 16, int32_t, 4, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 16, int32_t, 4, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 16, 16, int32_t, 8, 
precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 16, int32_t, 8, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 16, 16, float, 8, + precision::standard) __SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 16, 16, int32_t, 4, - use_tf32::no) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 16, 16, int32_t, 2, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 16, int32_t, 2, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2, use_tf32::no) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2, use_tf32::no) + precision::standard) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 16, 16, int32_t, 2, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 16, int32_t, 2, precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2, + precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2, + precision::standard) __SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8, - use_tf32::no) + precision::standard) // m16n16k8 tf32 -__SYCL_JOINT_MATRIX_OVERLOAD(float, a, 16, 8, int32_t, 4, use_tf32::yes) -__SYCL_JOINT_MATRIX_OVERLOAD(float, b, 8, 16, int32_t, 4, use_tf32::yes) +__SYCL_JOINT_MATRIX_OVERLOAD(float, a, 16, 8, int32_t, 4, precision::tf32) +__SYCL_JOINT_MATRIX_OVERLOAD(float, b, 8, 16, int32_t, 4, precision::tf32) #undef __SYCL_JOINT_MATRIX_OVERLOAD } // namespace experimental::matrix @@ -102,12 +118,12 @@ template struct joint_matrix_load_impl { void load(sycl::ext::oneapi::experimental::matrix::joint_matrix< - T, Use, NumRows, NumCols, Layout, sycl::sub_group, Tf32> &res, + T, Use, NumRows, NumCols, Layout, sycl::sub_group, Prec> &res, multi_ptr src, size_t stride); }; @@ -130,15 +146,15 @@ template + sycl::ext::oneapi::experimental::matrix::precision Prec> struct joint_matrix_load_impl< - T, Use, NumRows, NumCols, Layout, Space, Tf32, + T, Use, NumRows, NumCols, Layout, Space, Prec, typename std::enable_if_t> { void 
load(sycl::ext::oneapi::experimental::matrix::joint_matrix< - T, Use, NumRows, NumCols, Layout, sycl::sub_group, Tf32> &res, + T, Use, NumRows, NumCols, Layout, sycl::sub_group, Prec> &res, multi_ptr src, size_t stride) { if constexpr (std::is_same::value) { int32_t *tileptr = reinterpret_cast(src.get()); @@ -263,8 +279,8 @@ struct joint_matrix_load_impl< get_layout_id()); } } else if constexpr (std::is_same::value) { - if constexpr (Tf32 == - sycl::ext::oneapi::experimental::matrix::use_tf32::yes) { + if constexpr (Prec == + sycl::ext::oneapi::experimental::matrix::precision::tf32) { int32_t *tileptr = reinterpret_cast(src.get()); if constexpr (NumRows == 16 && NumCols == 8) { __mma_tf32_m16n16k8_ld_a(res.data, tileptr, stride, @@ -379,7 +395,7 @@ template struct joint_matrix_mad_impl { sycl::ext::oneapi::experimental::matrix::joint_matrix< @@ -387,11 +403,11 @@ struct joint_matrix_mad_impl { N, LayoutC, sycl::sub_group> mad(sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::a, M, K, - LayoutA, sycl::sub_group, Tf32> + LayoutA, sycl::sub_group, Prec> A, sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::b, K, N, - LayoutB, sycl::sub_group, Tf32> + LayoutB, sycl::sub_group, Prec> B, sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, @@ -435,9 +451,9 @@ template + sycl::ext::oneapi::experimental::matrix::precision Prec> struct joint_matrix_mad_impl< - T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, Tf32, + T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, Prec, typename std::enable_if_t< (LayoutA == sycl::ext::oneapi::experimental::matrix::matrix_layout:: row_major || @@ -456,11 +472,11 @@ struct joint_matrix_mad_impl< N, LayoutC, sycl::sub_group> mad(sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::a, M, K, - LayoutA, 
sycl::sub_group, Tf32> + LayoutA, sycl::sub_group, Prec> A, sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::b, K, N, - LayoutB, sycl::sub_group, Tf32> + LayoutB, sycl::sub_group, Prec> B, sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, @@ -530,8 +546,8 @@ struct joint_matrix_mad_impl< } } } else if constexpr (M == 16 && N == 16 && K == 8 && - Tf32 == sycl::ext::oneapi::experimental::matrix:: - use_tf32::yes) { + Prec == sycl::ext::oneapi::experimental::matrix:: + precision::tf32) { __mma_tf32_m16n16k8_mma_f32(D.data, A.data, B.data, C.data, get_layout_pair_id(), 0); } else if constexpr (std::is_same::value) { @@ -548,13 +564,13 @@ namespace experimental::matrix { template + precision Prec = precision::standard> void joint_matrix_load( - Group sg, joint_matrix &res, + Group sg, joint_matrix &res, multi_ptr src, size_t stride) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) sycl::ext::oneapi::detail::joint_matrix_load_impl{} + Layout, Space, Prec>{} .load(res, src, stride); #else (void)sg; @@ -592,15 +608,15 @@ void joint_matrix_store(Group sg, template + matrix_layout LayoutB, matrix_layout LayoutC, precision Prec> joint_matrix joint_matrix_mad( - Group sg, joint_matrix A, - joint_matrix B, + Group sg, joint_matrix A, + joint_matrix B, joint_matrix C) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) return sycl::ext::oneapi::detail::joint_matrix_mad_impl< - T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, Tf32>{} + T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, Prec>{} .mad(A, B, C); #else (void)sg; diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp index d2866d5baad69..1e4b1a7f18b22 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp @@ 
-59,11 +59,11 @@ int main() { sycl::sub_group sg = item.get_sub_group(); joint_matrix + sycl::sub_group, precision::tf32> sub_a; joint_matrix + sycl::sub_group, precision::tf32> sub_b; joint_matrix + sycl::sub_group, precision::tf32> sub_a; joint_matrix + sycl::sub_group, precision::tf32> sub_b; joint_matrix Date: Thu, 31 Mar 2022 15:13:23 +0100 Subject: [PATCH 20/26] Responding to comments. Using precision::tf32 as empty class and float as the fragment type. Also adding free functions tf32_to_float and float_to_tf32 --- .../ext/oneapi/matrix/matrix-tensorcore.hpp | 223 +++++++++--------- .../matrix/matrix-nvptx-tf32-test.cpp | 40 +++- 2 files changed, 136 insertions(+), 127 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index 22dcbe85120e8..5581a46ab27d9 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -18,112 +18,90 @@ enum class matrix_use { a, b, accumulator }; enum class matrix_layout { row_major, col_major, packed_a, packed_b }; -enum class precision { standard, tf32 /* TODO add more precisions*/ }; +namespace precision { +class tf32 {}; +} // namespace precision template + typename Group = sycl::sub_group, typename Cond = void> struct joint_matrix; -#define __SYCL_JOINT_MATRIX_OVERLOAD(type, use, M, N, frag_type, frag_size, \ - Prec) \ +#define __SYCL_JOINT_MATRIX_OVERLOAD(type, use, M, N, frag_type, frag_size) \ template \ struct joint_matrix< \ - type, matrix_use::use, M, N, Layout, sycl::sub_group, Prec, \ + type, matrix_use::use, M, N, Layout, sycl::sub_group, \ typename std::enable_if_t> { \ frag_type data[frag_size]; \ }; // m8n8k4 double only -__SYCL_JOINT_MATRIX_OVERLOAD(double, a, 8, 4, double, 1, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(double, b, 4, 8, double, 1, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(double, accumulator, 8, 8, double, 2, - 
precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(double, a, 8, 4, double, 1) +__SYCL_JOINT_MATRIX_OVERLOAD(double, b, 4, 8, double, 1) +__SYCL_JOINT_MATRIX_OVERLOAD(double, accumulator, 8, 8, double, 2) // m8n32k16 // bf16 data format uses uint16_t data type -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 8, 16, int32_t, 2, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 32, int32_t, 8, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 8, 16, int32_t, 8, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 32, int32_t, 8, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 8, 32, float, 8, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 8, 32, int32_t, 4, - precision::standard) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 8, 16, int32_t, 1, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 32, int32_t, 4, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 8, 16, int32_t, 1, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 32, int32_t, 4, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 8, 32, int32_t, 8, - precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 8, 16, int32_t, 2) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 32, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 8, 16, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 32, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 8, 32, float, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 8, 32, int32_t, 4) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 8, 16, int32_t, 1) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 32, int32_t, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 8, 16, int32_t, 1) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 32, int32_t, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 8, 32, int32_t, 8) // m32n8k16 -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 32, 16, int32_t, 8, - precision::standard) 
-__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 8, int32_t, 2, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 32, 16, int32_t, 8, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 8, int32_t, 8, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 32, 8, float, 8, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 32, 8, int32_t, 4, - precision::standard) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 32, 16, int32_t, 4, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 8, int32_t, 1, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 32, 16, int32_t, 4, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 8, int32_t, 1, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 32, 8, int32_t, 8, - precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 32, 16, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 8, int32_t, 2) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 32, 16, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 8, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 32, 8, float, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 32, 8, int32_t, 4) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 32, 16, int32_t, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 8, int32_t, 1) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 32, 16, int32_t, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 8, int32_t, 1) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 32, 8, int32_t, 8) // m16n16k16 -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 16, 16, int32_t, 4, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 16, int32_t, 4, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 16, 16, int32_t, 8, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 16, int32_t, 8, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 16, 16, float, 8, - precision::standard) 
-__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 16, 16, int32_t, 4, - precision::standard) - -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 16, 16, int32_t, 2, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 16, int32_t, 2, precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2, - precision::standard) -__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8, - precision::standard) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, a, 16, 16, int32_t, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(uint16_t, b, 16, 16, int32_t, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(half, a, 16, 16, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(half, b, 16, 16, int32_t, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(float, accumulator, 16, 16, float, 8) +__SYCL_JOINT_MATRIX_OVERLOAD(half, accumulator, 16, 16, int32_t, 4) + +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, a, 16, 16, int32_t, 2) +__SYCL_JOINT_MATRIX_OVERLOAD(int8_t, b, 16, 16, int32_t, 2) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, a, 16, 16, int32_t, 2) +__SYCL_JOINT_MATRIX_OVERLOAD(uint8_t, b, 16, 16, int32_t, 2) +__SYCL_JOINT_MATRIX_OVERLOAD(int32_t, accumulator, 16, 16, int32_t, 8) // m16n16k8 tf32 -__SYCL_JOINT_MATRIX_OVERLOAD(float, a, 16, 8, int32_t, 4, precision::tf32) -__SYCL_JOINT_MATRIX_OVERLOAD(float, b, 8, 16, int32_t, 4, precision::tf32) +__SYCL_JOINT_MATRIX_OVERLOAD(precision::tf32, a, 16, 8, float, 4) +__SYCL_JOINT_MATRIX_OVERLOAD(precision::tf32, b, 8, 16, float, 4) #undef __SYCL_JOINT_MATRIX_OVERLOAD } // namespace experimental::matrix namespace detail { -template + access::address_space Space, typename Cond = void> struct joint_matrix_load_impl { void load(sycl::ext::oneapi::experimental::matrix::joint_matrix< - T, Use, NumRows, NumCols, Layout, sycl::sub_group, Prec> &res, + S, Use, NumRows, NumCols, Layout, sycl::sub_group> &res, multi_ptr src, size_t stride); }; @@ -142,19 +120,19 @@ constexpr int get_layout_id< return 1; } 
-template + access::address_space Space> struct joint_matrix_load_impl< - T, Use, NumRows, NumCols, Layout, Space, Prec, + S, T, Use, NumRows, NumCols, Layout, Space, typename std::enable_if_t> { void load(sycl::ext::oneapi::experimental::matrix::joint_matrix< - T, Use, NumRows, NumCols, Layout, sycl::sub_group, Prec> &res, + S, Use, NumRows, NumCols, Layout, sycl::sub_group> &res, multi_ptr src, size_t stride) { if constexpr (std::is_same::value) { int32_t *tileptr = reinterpret_cast(src.get()); @@ -279,21 +257,7 @@ struct joint_matrix_load_impl< get_layout_id()); } } else if constexpr (std::is_same::value) { - if constexpr (Prec == - sycl::ext::oneapi::experimental::matrix::precision::tf32) { - int32_t *tileptr = reinterpret_cast(src.get()); - if constexpr (NumRows == 16 && NumCols == 8) { - __mma_tf32_m16n16k8_ld_a(res.data, tileptr, stride, - get_layout_id()); - } else if constexpr (NumRows == 8 && NumCols == 16) { - __mma_tf32_m16n16k8_ld_b(res.data, tileptr, stride, - get_layout_id()); - } - for (int i = 0; i < 4; ++i) { - auto tmpfloat = __nvvm_bitcast_i2f(res.data[i]); - res.data[i] = __nvvm_f2tf32_rna(tmpfloat); - } - } else { + if (std::is_same::value) { if constexpr (NumRows == 16 && NumCols == 16) { __hmma_m16n16k16_ld_c_f32(res.data, src.get(), stride, get_layout_id()); @@ -304,6 +268,16 @@ struct joint_matrix_load_impl< __hmma_m32n8k16_ld_c_f32(res.data, src.get(), stride, get_layout_id()); } + } else if (std::is_same::value) { + int32_t *tileptr = reinterpret_cast(src.get()); + if constexpr (NumRows == 16 && NumCols == 8) { + __mma_tf32_m16n16k8_ld_a(reinterpret_cast(res.data), + tileptr, stride, get_layout_id()); + } else if constexpr (NumRows == 8 && NumCols == 16) { + __mma_tf32_m16n16k8_ld_b(reinterpret_cast(res.data), + tileptr, stride, get_layout_id()); + } } } else if constexpr (std::is_same::value) { if constexpr (Use == @@ -395,7 +369,6 @@ template struct joint_matrix_mad_impl { sycl::ext::oneapi::experimental::matrix::joint_matrix< @@ 
-403,11 +376,11 @@ struct joint_matrix_mad_impl { N, LayoutC, sycl::sub_group> mad(sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::a, M, K, - LayoutA, sycl::sub_group, Prec> + LayoutA, sycl::sub_group> A, sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::b, K, N, - LayoutB, sycl::sub_group, Prec> + LayoutB, sycl::sub_group> B, sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, @@ -450,10 +423,9 @@ constexpr int get_layout_pair_id< template + sycl::ext::oneapi::experimental::matrix::matrix_layout LayoutC> struct joint_matrix_mad_impl< - T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, Prec, + T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, typename std::enable_if_t< (LayoutA == sycl::ext::oneapi::experimental::matrix::matrix_layout:: row_major || @@ -472,11 +444,11 @@ struct joint_matrix_mad_impl< N, LayoutC, sycl::sub_group> mad(sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::a, M, K, - LayoutA, sycl::sub_group, Prec> + LayoutA, sycl::sub_group> A, sycl::ext::oneapi::experimental::matrix::joint_matrix< T1, sycl::ext::oneapi::experimental::matrix::matrix_use::b, K, N, - LayoutB, sycl::sub_group, Prec> + LayoutB, sycl::sub_group> B, sycl::ext::oneapi::experimental::matrix::joint_matrix< T2, sycl::ext::oneapi::experimental::matrix::matrix_use::accumulator, @@ -545,10 +517,9 @@ struct joint_matrix_mad_impl< get_layout_pair_id(), 0); } } - } else if constexpr (M == 16 && N == 16 && K == 8 && - Prec == sycl::ext::oneapi::experimental::matrix:: - precision::tf32) { - __mma_tf32_m16n16k8_mma_f32(D.data, A.data, B.data, C.data, + } else if constexpr (M == 16 && N == 16 && K == 8) { + __mma_tf32_m16n16k8_mma_f32(D.data, reinterpret_cast(A.data), + reinterpret_cast(B.data), C.data, get_layout_pair_id(), 0); } else if 
constexpr (std::is_same::value) { __dmma_m8n8k4_mma_f64(D.data, A.data, B.data, C.data, @@ -562,15 +533,19 @@ struct joint_matrix_mad_impl< namespace experimental::matrix { -template +template ::value || + (std::is_same::value && + std::is_same::value), + bool> = true> void joint_matrix_load( - Group sg, joint_matrix &res, + Group sg, joint_matrix &res, multi_ptr src, size_t stride) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - sycl::ext::oneapi::detail::joint_matrix_load_impl{} + sycl::ext::oneapi::detail::joint_matrix_load_impl{} .load(res, src, stride); #else (void)sg; @@ -608,15 +583,15 @@ void joint_matrix_store(Group sg, template + matrix_layout LayoutB, matrix_layout LayoutC> joint_matrix joint_matrix_mad( - Group sg, joint_matrix A, - joint_matrix B, + Group sg, joint_matrix A, + joint_matrix B, joint_matrix C) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) return sycl::ext::oneapi::detail::joint_matrix_mad_impl< - T1, T2, M, K, N, LayoutA, LayoutB, LayoutC, Prec>{} + T1, T2, M, K, N, LayoutA, LayoutB, LayoutC>{} .mad(A, B, C); #else (void)sg; @@ -629,6 +604,24 @@ joint_matrix_mad( #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } +float float_to_tf32(float a) { +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + int32_t tmp_int = __nvvm_f2tf32_rna(a); + return __nvvm_bitcast_i2f(tmp_int); +#else + throw runtime_error("When using SYCL_EXT_ONEAPI_MATRIX=3 float_to_tf32 is " + "only supported by CUDA devices", + PI_INVALID_DEVICE); +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +} + +// This function just zeros out the bottom 13 bits of the tf32 type +float tf32_to_float(float a) { + uint32_t tmp_uint = reinterpret_cast(a); + tmp_uint &= 0xFFFFE000u; + return reinterpret_cast(tmp_uint); +} + } // namespace experimental::matrix } // namespace oneapi } // namespace ext diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp 
b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp index 1e4b1a7f18b22..6099f3a1febd4 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp @@ -58,24 +58,32 @@ int main() { [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { sycl::sub_group sg = item.get_sub_group(); - joint_matrix + joint_matrix sub_a; - joint_matrix + joint_matrix sub_b; joint_matrix sub_c; - //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_accC, i32 16) #{{.*}} - joint_matrix_load(sg, sub_c, accC.get_pointer(), N); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} joint_matrix_load(sg, sub_a, accA.get_pointer(), K); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.row.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 16) #{{.*}} joint_matrix_load(sg, sub_b, accB.get_pointer(), N); + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_accC, i32 16) #{{.*}} + joint_matrix_load(sg, sub_c, accC.get_pointer(), N); + + // Round a, b to tf32 + for (auto i = 0; i < 4; ++i) + sub_a.data[i] = float_to_tf32(sub_a.data[i]); + + for (auto i = 0; i < 4; ++i) + sub_b.data[i] = float_to_tf32(sub_b.data[i]); + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}} sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); //CHECK: tail call void 
@llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1f32(float addrspace(1)* {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 {{.*}} @@ -94,24 +102,32 @@ int main() { [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] { sycl::sub_group sg = item.get_sub_group(); - joint_matrix + joint_matrix sub_a; - joint_matrix + joint_matrix sub_b; joint_matrix sub_c; - //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, i32 {{.*}}) #{{.*}} - joint_matrix_load(sg, sub_c, accC.get_pointer(), N); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.col.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 8) #{{.*}} joint_matrix_load(sg, sub_a, accA.get_pointer(), K); //CHECK: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.col.stride.tf32.p0i32(i32* %call.ascast.i.i{{.*}}.i, i32 16) #{{.*}} joint_matrix_load(sg, sub_b, accB.get_pointer(), N); + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, i32 {{.*}}) #{{.*}} + joint_matrix_load(sg, sub_c, accC.get_pointer(), N); + + // Round a, b to tf32 + for (auto i = 0; i < 4; ++i) + sub_a.data[i] = float_to_tf32(sub_a.data[i]); + + for (auto i = 0; i < 4; ++i) + sub_b.data[i] = float_to_tf32(sub_b.data[i]); + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.col.col.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}} sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); //CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, float 
{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) #{{.*}} From 618c80750930b0eaec8cde468c880d52ba54c80c Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 31 Mar 2022 16:12:49 +0100 Subject: [PATCH 21/26] Add device code check for conversion builtin --- .../sycl/ext/oneapi/matrix/matrix-tensorcore.hpp | 10 ++++++---- .../matrix/matrix-nvptx-tf32-test.cpp | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index 5581a46ab27d9..e02b9208e5781 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -609,9 +609,10 @@ float float_to_tf32(float a) { int32_t tmp_int = __nvvm_f2tf32_rna(a); return __nvvm_bitcast_i2f(tmp_int); #else - throw runtime_error("When using SYCL_EXT_ONEAPI_MATRIX=3 float_to_tf32 is " - "only supported by CUDA devices", - PI_INVALID_DEVICE); + uint32_t tmp_uint = reinterpret_cast(a); + tmp_uint += 0x1000u; + float ret = reinterpret_cast(tmp_uint); + return ret; #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } @@ -619,7 +620,8 @@ float float_to_tf32(float a) { float tf32_to_float(float a) { uint32_t tmp_uint = reinterpret_cast(a); tmp_uint &= 0xFFFFE000u; - return reinterpret_cast(tmp_uint); + float ret = reinterpret_cast(tmp_uint); + return ret; } } // namespace experimental::matrix diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp index 6099f3a1febd4..c2ac099a0c59a 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp @@ -77,6 +77,7 @@ int main() { //CHECK: tail call { float, float, float, float, float, float, float, float } 
@llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1f32(float addrspace(1)* %_arg_accC, i32 16) #{{.*}} joint_matrix_load(sg, sub_c, accC.get_pointer(), N); + // CHECK: tail call i32 @llvm.nvvm.f2tf32.rna(float {{.*}} // Round a, b to tf32 for (auto i = 0; i < 4; ++i) sub_a.data[i] = float_to_tf32(sub_a.data[i]); @@ -120,14 +121,15 @@ int main() { joint_matrix_load(sg, sub_b, accB.get_pointer(), N); //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, i32 {{.*}}) #{{.*}} joint_matrix_load(sg, sub_c, accC.get_pointer(), N); - + + // CHECK: tail call i32 @llvm.nvvm.f2tf32.rna(float {{.*}} // Round a, b to tf32 for (auto i = 0; i < 4; ++i) sub_a.data[i] = float_to_tf32(sub_a.data[i]); for (auto i = 0; i < 4; ++i) sub_b.data[i] = float_to_tf32(sub_b.data[i]); - + //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.col.col.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}} sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); //CHECK: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1f32(float addrspace(1)* {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16) #{{.*}} From b03e6619969f4571c7b1481f09be3eed2667a15a Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 14 Apr 2022 10:36:13 +0100 Subject: [PATCH 22/26] Zeroing out bottom bits in sware impl --- sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index e02b9208e5781..1de0a03da463e 100644 --- 
a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -611,6 +611,7 @@ float float_to_tf32(float a) { #else uint32_t tmp_uint = reinterpret_cast(a); tmp_uint += 0x1000u; + tmp_uint &= 0xFFFFE000u; float ret = reinterpret_cast(tmp_uint); return ret; #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) From 077e0f4ec95cace19df2d47424728ba1df88f62d Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 14 Apr 2022 10:44:07 +0100 Subject: [PATCH 23/26] Changing names --- sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index 1de0a03da463e..b819d2b167077 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -604,7 +604,9 @@ joint_matrix_mad( #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } -float float_to_tf32(float a) { +// This function rounds the bottom 13 bits up or down, and then zeros out the +// bottom bits +float round_to_tf32(float a) { #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) int32_t tmp_int = __nvvm_f2tf32_rna(a); return __nvvm_bitcast_i2f(tmp_int); @@ -618,7 +620,7 @@ float float_to_tf32(float a) { } // This function just zeros out the bottom 13 bits of the tf32 type -float tf32_to_float(float a) { +float truncate_to_tf32(float a) { uint32_t tmp_uint = reinterpret_cast(a); tmp_uint &= 0xFFFFE000u; float ret = reinterpret_cast(tmp_uint); From 5b5bbcc7feb65c729c2c2aa20886af26c8e6b3d3 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 19 Apr 2022 09:21:04 +0100 Subject: [PATCH 24/26] Removing newline --- .../extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc | 1 - 1 file changed, 1 deletion(-) diff --git 
a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc index 6915c9c9e2d81..88b6c73b02514 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc @@ -57,7 +57,6 @@ floating-point type(`float`) to `bfloat16` type and vice versa. The extension doesn't add support for `bfloat16` type as such, instead it uses 16-bit integer type(`uint16_t`) as a storage for `bfloat16` values. - The purpose of conversion from float to bfloat16 is to reduce the amount of memory required to store floating-point numbers. Computations are expected to be done with 32-bit floating-point values. From 560b02d2635252bcfb7369594fc97228445a24ff Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 21 Apr 2022 09:12:13 +0100 Subject: [PATCH 25/26] Removing truncate function --- sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp index b819d2b167077..4aa9ff0effc4a 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-tensorcore.hpp @@ -619,14 +619,6 @@ float round_to_tf32(float a) { #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } -// This function just zeros out the bottom 13 bits of the tf32 type -float truncate_to_tf32(float a) { - uint32_t tmp_uint = reinterpret_cast(a); - tmp_uint &= 0xFFFFE000u; - float ret = reinterpret_cast(tmp_uint); - return ret; -} - } // namespace experimental::matrix } // namespace oneapi } // namespace ext From 13b1efbb3c67aada11b8a3db1723bbd3e068bfc4 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Fri, 22 Apr 2022 10:50:05 +0100 Subject: [PATCH 26/26] Updating test --- .../check_device_code/matrix/matrix-nvptx-tf32-test.cpp | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp index c2ac099a0c59a..9cdd5e739b00a 100644 --- a/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp +++ b/sycl/test/check_device_code/matrix/matrix-nvptx-tf32-test.cpp @@ -80,10 +80,10 @@ int main() { // CHECK: tail call i32 @llvm.nvvm.f2tf32.rna(float {{.*}} // Round a, b to tf32 for (auto i = 0; i < 4; ++i) - sub_a.data[i] = float_to_tf32(sub_a.data[i]); + sub_a.data[i] = round_to_tf32(sub_a.data[i]); for (auto i = 0; i < 4; ++i) - sub_b.data[i] = float_to_tf32(sub_b.data[i]); + sub_b.data[i] = round_to_tf32(sub_b.data[i]); //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.row.row.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 %{{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}} sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); @@ -125,10 +125,10 @@ int main() { // CHECK: tail call i32 @llvm.nvvm.f2tf32.rna(float {{.*}} // Round a, b to tf32 for (auto i = 0; i < 4; ++i) - sub_a.data[i] = float_to_tf32(sub_a.data[i]); + sub_a.data[i] = round_to_tf32(sub_a.data[i]); for (auto i = 0; i < 4; ++i) - sub_b.data[i] = float_to_tf32(sub_b.data[i]); + sub_b.data[i] = round_to_tf32(sub_b.data[i]); //CHECK: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k8.mma.col.col.tf32(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}) #{{.*}} sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);