import torch

import triton
import triton.language as tl

DEVICE = triton.runtime.driver.active.get_active_torch_device()

def is_cuda():
    return triton.runtime.driver.active.get_current_target().backend == "cuda"

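# Candidate tile configurations for the autotuner. Each config fixes the output
# tile (BLOCK_SIZE_M x BLOCK_SIZE_N), the K step per loop iteration
# (BLOCK_SIZE_K), and the L2-swizzling group size (GROUP_SIZE_M); num_stages
# sets the software-pipelining depth and num_warps the warps per program.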
def get_cuda_autotune_config():
    return [
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
    ]

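# AMD (HIP) configurations. matrix_instr_nonkdim=16 hints the backend toward
# 16x16 MFMA matrix instructions; the larger BLOCK_SIZE_K suits those shapes.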
def get_hip_autotune_config():
    sizes = [
        {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 6},
        {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 6},
        {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4},
        {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 6},
        {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4},
        {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4},
        {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4},
        {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 6},
    ]
    return [triton.Config(s | {'matrix_instr_nonkdim': 16}, num_warps=8, num_stages=2) for s in sizes]

def get_autotune_config():
    if is_cuda():
        return get_cuda_autotune_config()
    else:
        return get_hip_autotune_config()

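# `@triton.autotune` benchmarks every config above on the first call and caches
# the fastest one per (M, N, K); a new shape triggers re-tuning. Note that
# PRECISION is not part of the key, so for a given shape the chosen config is
# shared across precisions.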
@triton.autotune(
    configs=get_autotune_config(),
    key=['M', 'N', 'K'],
)
@triton.jit
def matmul_kernel(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    PRECISION: tl.constexpr,
):
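    # Map the 1D program id onto the 2D tile grid using grouped ordering:
    # programs walk GROUP_SIZE_M rows of tiles down a column before moving
    # right, which improves L2 reuse of the loaded A and B blocks.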
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

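    # tl.assume records facts the compiler cannot prove on its own (program ids
    # and strides are non-negative), enabling cheaper address arithmetic.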
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)
    tl.assume(stride_am > 0)
    tl.assume(stride_ak > 0)
    tl.assume(stride_bn > 0)
    tl.assume(stride_bk > 0)
    tl.assume(stride_cm > 0)
    tl.assume(stride_cn > 0)

    # First-block offsets; the modulo keeps out-of-range rows/cols in-bounds
    # (the masked store below discards the duplicated results).
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Mask off the K tail on the last iteration.
        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
        # PRECISION selects how the fp32 dot is computed ("ieee" or one of the
        # bf16-emulated variants); accumulate in-place via the acc argument.
        accumulator = tl.dot(a, b, accumulator, input_precision=PRECISION)
        # Advance the pointers to the next K block.
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk
    c = accumulator  # already fp32; no conversion needed

    # Write the output tile with a boundary mask.
    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, c, mask=c_mask)

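# For intuition: a hypothetical PyTorch reference for the "bf16x3" scheme
# (illustrative only; Triton's internal decomposition may differ). Each fp32
# operand is split into a bf16 "hi" limb plus a bf16 residual "lo" limb, and
# three bf16 products recover most of the fp32 mantissa (the lo @ lo term is
# dropped). The x6/x9 variants use more limbs / cross terms for more accuracy.
def bf16x3_matmul_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    a_hi = a.to(torch.bfloat16).to(torch.float32)
    a_lo = (a - a_hi).to(torch.bfloat16).to(torch.float32)
    b_hi = b.to(torch.bfloat16).to(torch.float32)
    b_lo = (b - b_hi).to(torch.bfloat16).to(torch.float32)
    return a_hi @ b_hi + a_hi @ b_lo + a_lo @ b_hi
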
def matmul(a, b, precision="ieee"):
    assert a.shape[1] == b.shape[0], "incompatible matmul dimensions"
    M, K = a.shape
    K, N = b.shape
    c = torch.empty((M, N), device=a.device, dtype=torch.float32)
    # 1D launch grid: one program per output tile.
    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
    matmul_kernel[grid](
        a, b, c,
        M, N, K,
        a.stride(0), a.stride(1),
        b.stride(0), b.stride(1),
        c.stride(0), c.stride(1),
        PRECISION=precision,
    )
    return c

precisions = ["ieee", "bf16", "bf16x3", "bf16x6", "bf16x9"]
torch.manual_seed(0)

for precision in precisions:
    a = torch.rand((512, 512), device=DEVICE, dtype=torch.float32) - 0.5
    b = torch.rand((512, 512), device=DEVICE, dtype=torch.float32) - 0.5
    triton_output = matmul(a, b, precision=precision)
    torch_output = torch.matmul(a, b)
    # print(f"triton_output_with_fp32_inputs={triton_output}")
    # print(f"torch_output_with_fp32_inputs={torch_output}")

    # A plain bf16 dot is unlikely to meet this fp32 tolerance; the x3/x6/x9
    # emulations should close most of the gap.
    if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0):
        print(f'✅ Triton and Torch match for input_precision={precision}')
    else:
        print(f'❌ Triton and Torch differ for input_precision={precision}')

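# Benchmark against the vendor BLAS on square matrices from 256 to 4096.
# Throughput is reported in TFLOPS, counting 2*M*N*K flops per matmul.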
ref_lib = 'cuBLAS' if is_cuda() else 'rocBLAS'

configs = []
configs.append(
    triton.testing.Benchmark(
        x_names=["M", "N", "K"],
        x_vals=[128 * i for i in range(2, 33)],
        line_arg="provider",
        line_vals=[ref_lib.lower(), "triton-ieee", "triton-bf16", "triton-bf16x3", "triton-bf16x6", "triton-bf16x9"],
        line_names=[ref_lib, "Triton-IEEE", "Triton-BF16", "Triton-BF16x3", "Triton-BF16x6", "Triton-BF16x9"],
        # Distinct colors so the five Triton variants are distinguishable on the plot.
        styles=[("green", "-"), ("blue", "-"), ("orange", "-"), ("red", "-"), ("purple", "-"), ("brown", "-")],
        ylabel="TFLOPS",
        plot_name="matmul-performance-f32",
        args={},
    ))

@triton.testing.perf_report(configs)
def benchmark(M, N, K, provider):
    a = torch.randn((M, K), device=DEVICE, dtype=torch.float32)
    b = torch.randn((K, N), device=DEVICE, dtype=torch.float32)
    quantiles = [0.5, 0.2, 0.8]
    if provider == ref_lib.lower():
        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), quantiles=quantiles)
    elif provider.startswith('triton-'):
        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b, provider.removeprefix('triton-')), quantiles=quantiles)
    # A matmul performs 2*M*N*K flops; convert the measured times to TFLOPS.
    perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
    return perf(ms), perf(max_ms), perf(min_ms)

benchmark.run(show_plots=False, print_data=True)