
Commit 591d437

[BACKEND] Implement BF16x3 trick
Implements emulation of a 32-bit floating point dot operation using three BF16 values. This is based on https://arxiv.org/abs/1904.06376 and works because the mantissas of three BF16s together cover the mantissa of an FP32.

Storing one FP32 in three BF16s:

```python
import torch

def BF16(v):
    return v.to(torch.bfloat16)

def FP32(v):
    return v.to(torch.float32)

def BF16x3(v):
    b0 = BF16(v)
    b1 = BF16(v - FP32(b0))
    b2 = BF16(v - FP32(b0) - FP32(b1))
    return (b0, b1, b2)

original = torch.rand(1, 1, dtype=torch.float32)
bf16x3 = BF16x3(original)
```

Emulating multiplication of two FP32s (index 0 is the high part, 1 the middle, 2 the low):

```python
def mul_bf16x3(a, b, c):
    a0, a1, a2 = BF16x3(a)
    b0, b1, b2 = BF16x3(b)
    c = c + (a0 * b0)  # hi * hi
    c = c + (a1 * b0)  # mid * hi
    c = c + (a0 * b1)  # hi * mid
    c = c + (a1 * b1)  # mid * mid
    c = c + (a0 * b2)  # hi * lo
    c = c + (a2 * b0)  # lo * hi
    c = c + (a1 * b2)  # mid * lo
    c = c + (a2 * b1)  # lo * mid
    c = c + (a2 * b2)  # lo * lo
    return c

a = torch.rand(1, 1, dtype=torch.float32)
b = torch.rand(1, 1, dtype=torch.float32)
c = torch.zeros(1, 1, dtype=torch.float32)  # accumulator
result = mul_bf16x3(a, b, c)
```

The BF16x3 emulation is used when `tl.dot` is invoked with input precision 'bf16x3'. The pass is implemented in a GPU-agnostic manner, but it is needed to support MI350, which lacks TF32 support. That part is still a work in progress and will build on this patch.
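As a quick sanity check of the mantissa-coverage claim (this snippet is an editorial addition, not part of the commit), splitting random FP32 values into three BF16 parts and summing them back recovers far more precision than a single BF16 cast:

```python
import torch

def split_bf16x3(x):
    # b0 holds the top mantissa bits, b1 and b2 hold successive residuals.
    b0 = x.to(torch.bfloat16)
    b1 = (x - b0.float()).to(torch.bfloat16)
    b2 = (x - b0.float() - b1.float()).to(torch.bfloat16)
    return b0, b1, b2

x = torch.rand(1 << 16, dtype=torch.float32)
b0, b1, b2 = split_bf16x3(x)
recovered = b0.float() + b1.float() + b2.float()
print("single BF16 max error:", (x - b0.float()).abs().max().item())
print("BF16x3 max error:     ", (x - recovered).abs().max().item())
```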
1 parent 16b25e1 commit 591d437

File tree

11 files changed: +200, -5 lines changed

include/triton/Dialect/Triton/IR/TritonAttrDefs.td

Lines changed: 3 additions & 1 deletion
```diff
@@ -129,7 +129,9 @@ def TT_InputPrecisionAttr : I32EnumAttr<
   [
     I32EnumAttrCase<"TF32", 0, "tf32">,
     I32EnumAttrCase<"TF32x3", 1, "tf32x3">,
-    I32EnumAttrCase<"IEEE", 2, "ieee">
+    I32EnumAttrCase<"IEEE", 2, "ieee">,
+    I32EnumAttrCase<"BF16", 3, "bf16">,
+    I32EnumAttrCase<"BF16x3", 4, "bf16x3">
   ]>{
   let cppNamespace = "::mlir::triton";
 }
```

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 11 additions & 0 deletions
```diff
@@ -201,6 +201,17 @@ def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
                         "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"];
 }
 
+def TritonGPUBF16x3Dot : Pass<"tritongpu-BF16x3Dot", "mlir::ModuleOp"> {
+  let summary = "3xBF16 trick";
+
+  let description = [{
+    Decompose fp32 `DotOp` instructions into BF16 operations.
+    See https://arxiv.org/abs/1904.06376
+  }];
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
+}
+
 def TritonGPUPrefetch : Pass<"tritongpu-prefetch", "mlir::ModuleOp"> {
   let summary = "prefetch";
```

lib/Dialect/TritonGPU/Transforms/BF16x3Dot.cpp (new file; path inferred from the CMakeLists.txt change below)

Lines changed: 135 additions & 0 deletions

```cpp
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
#include "llvm/Support/raw_ostream.h"

namespace mlir {
namespace triton {
namespace gpu {

#define GEN_PASS_DEF_TRITONGPUBF16X3DOT
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"

namespace {

// Implement 3xBF16 https://arxiv.org/abs/1904.06376
class BF16x3 : public OpRewritePattern<DotOp> {
public:
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(DotOp dotOp,
                                PatternRewriter &rewriter) const override {

    auto isBF16x3Candidate = [](Value operand) {
      return cast<RankedTensorType>(operand.getType()).getElementType().isF32();
    };

    if (!(dotOp.getInputPrecision() == InputPrecision::BF16x3 &&
          isBF16x3Candidate(dotOp.getA()) && isBF16x3Candidate(dotOp.getB()))) {
      return failure();
    }

    // Aux functions
    auto f32ToBF16 = [&](Value value) -> Value {
      auto fp32Type = cast<RankedTensorType>(value.getType());
      auto bf16Type =
          RankedTensorType::get(fp32Type.getShape(), rewriter.getBF16Type());
      return rewriter.create<arith::TruncFOp>(dotOp.getLoc(), bf16Type, value)
          .getResult();
    };
    auto bf16ToF32 = [&](Value value) -> Value {
      auto bf16Type = cast<RankedTensorType>(value.getType());
      auto fp32Type =
          RankedTensorType::get(bf16Type.getShape(), rewriter.getF32Type());
      return rewriter.create<arith::ExtFOp>(dotOp.getLoc(), fp32Type, value)
          .getResult();
    };
    auto zeroLike = [&](Value c) -> Value {
      return rewriter.create<SplatOp>(
          dotOp->getLoc(), c.getType(),
          rewriter.create<arith::ConstantOp>(dotOp->getLoc(),
                                             rewriter.getF32FloatAttr(0)));
    };
    auto add = [&](Value a, Value b) -> Value {
      return rewriter.create<arith::AddFOp>(dotOp.getLoc(), a, b);
    };
    auto sub = [&](Value a, Value b) -> Value {
      return rewriter.create<arith::SubFOp>(dotOp.getLoc(), a, b);
    };
    auto dot = [&](Value a, Value b, Value c) -> Value {
      return rewriter.create<DotOp>(dotOp->getLoc(), c.getType(), a, b, c,
                                    InputPrecision::BF16,
                                    dotOp.getMaxNumImpreciseAcc());
    };
    auto replaceNansWithZeros = [&](Value value) -> Value {
      auto nans = rewriter.create<arith::CmpFOp>(
          dotOp->getLoc(), arith::CmpFPredicate::UNO, value, value);
      auto zero = zeroLike(value);
      return rewriter.create<arith::SelectOp>(dotOp->getLoc(), nans, zero,
                                              value);
    };

    auto SplitF32 = [&](Value input, unsigned N) -> std::vector<Value> {
      std::vector<Value> split_inputs;
      split_inputs.reserve(N);
      for (int i = 0; i < N; ++i) {
        Value input_as_bf16 = f32ToBF16(input);
        if (i != N - 1) {
          Value input_as_f32 = bf16ToF32(input_as_bf16);
          input = rewriter.create<arith::SubFOp>(dotOp->getLoc(), input,
                                                 input_as_f32);
        }
        split_inputs.push_back(input_as_bf16);
      }
      return split_inputs;
    };

    const int hi = 0;
    const int med = 1;
    const int lo = 2;

    const unsigned N = 3;
    auto lhs_parts = SplitF32(dotOp.getA(), N);
    auto rhs_parts = SplitF32(dotOp.getB(), N);

    auto result = zeroLike(dotOp.getC());

    result = dot(lhs_parts[lo], rhs_parts[lo], result);
    result = dot(lhs_parts[med], rhs_parts[lo], result);
    result = dot(lhs_parts[lo], rhs_parts[med], result);

    result = dot(lhs_parts[med], rhs_parts[med], result);

    result = dot(lhs_parts[lo], rhs_parts[hi], result);
    result = dot(lhs_parts[hi], rhs_parts[lo], result);

    result = dot(lhs_parts[med], rhs_parts[hi], result);
    result = dot(lhs_parts[hi], rhs_parts[med], result);

    result = replaceNansWithZeros(result);
    result = dot(lhs_parts[hi], rhs_parts[hi], result);
    result = add(result, dotOp.getC());

    rewriter.replaceOp(dotOp, result);
    return success();
  }
};

} // anonymous namespace

struct BF16x3DotPass : public impl::TritonGPUBF16x3DotBase<BF16x3DotPass> {
  void runOnOperation() override {
    MLIRContext *context = &getContext();
    ModuleOp m = getOperation();

    RewritePatternSet decomposePatterns(context);
    decomposePatterns.add<BF16x3>(context);
    if (applyPatternsGreedily(m, std::move(decomposePatterns)).failed()) {
      signalPassFailure();
    }
  }
};

} // namespace gpu
} // namespace triton
} // namespace mlir
```
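To see what the rewrite computes, here is a PyTorch reference sketch (an editorial addition, not part of the commit) that mirrors the pattern: the same three-way split, the same nine BF16 partial dots accumulated from the least to the most significant cross terms, the NaN flush before the final hi*hi dot, and the original accumulator added back at the end. The `bf16_dot` helper models a BF16 dot with FP32 accumulation; the NaN flush presumably serves the same purpose as in the existing TF32x3 path, where infinite operands make low-order partial products produce `inf * 0 = NaN`.

```python
import torch

def bf16x3_dot_reference(a, b, c):
    # All tensors are fp32; mirrors the BF16x3 rewrite pattern above.
    def split(x):
        hi = x.to(torch.bfloat16)
        mid = (x - hi.float()).to(torch.bfloat16)
        lo = (x - hi.float() - mid.float()).to(torch.bfloat16)
        return hi, mid, lo

    def bf16_dot(x, y, acc):
        # Stand-in for tt.dot with inputPrecision = bf16: BF16 inputs, FP32 accumulation.
        # BF16 products are exactly representable in FP32, so an fp32 matmul of the
        # upcast operands models the MMA closely, up to accumulation order.
        return acc + x.float() @ y.float()

    a_hi, a_mid, a_lo = split(a)
    b_hi, b_mid, b_lo = split(b)

    acc = torch.zeros_like(c)
    for x, y in [(a_lo, b_lo), (a_mid, b_lo), (a_lo, b_mid),
                 (a_mid, b_mid),
                 (a_lo, b_hi), (a_hi, b_lo),
                 (a_mid, b_hi), (a_hi, b_mid)]:
        acc = bf16_dot(x, y, acc)

    acc = torch.where(torch.isnan(acc), torch.zeros_like(acc), acc)  # replaceNansWithZeros
    acc = bf16_dot(a_hi, b_hi, acc)
    return acc + c  # the original accumulator C is added last

a = torch.rand(64, 64)
b = torch.rand(64, 64)
c = torch.zeros(64, 64)
print((bf16x3_dot_reference(a, b, c) - a @ b).abs().max().item())
```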

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -2,6 +2,7 @@ add_triton_library(TritonGPUTransforms
   AccelerateMatmul.cpp
   Coalesce.cpp
   F32DotTC.cpp
+  BF16x3Dot.cpp
   FuseNestedLoops.cpp
   CombineTensorSelectAndIf.cpp
   DecomposeScaledBlocked.cpp
```

python/src/ir.cc

Lines changed: 2 additions & 0 deletions
```diff
@@ -272,6 +272,8 @@ void init_triton_ir(py::module &&m) {
       .value("TF32", InputPrecision::TF32)
       .value("TF32x3", InputPrecision::TF32x3)
       .value("IEEE", InputPrecision::IEEE)
+      .value("BF16", InputPrecision::BF16)
+      .value("BF16x3", InputPrecision::BF16x3)
       .export_values();
 
   py::enum_<ScaleDotElemType>(m, "ScaleDotElemTypeTY", py::module_local())
```
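With these bindings rebuilt, the new members appear on `ir.INPUT_PRECISION` like the existing ones; a hypothetical quick check, assuming a local build that includes this patch:

```python
from triton._C.libtriton import ir

# Both members are exported by the enum above; semantic.py resolves the
# user-facing string via getattr(ir.INPUT_PRECISION, "BF16x3").
print(ir.INPUT_PRECISION.BF16)
print(ir.INPUT_PRECISION.BF16x3)
```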

python/src/passes.cc

Lines changed: 1 addition & 0 deletions
```diff
@@ -70,6 +70,7 @@ void init_triton_passes_ttgpuir(py::module &&m) {
   ADD_PASS_WRAPPER_0("add_accelerate_matmul", createTritonGPUAccelerateMatmul);
   ADD_PASS_WRAPPER_0("add_reorder_instructions",
                      createTritonGPUReorderInstructions);
+  ADD_PASS_WRAPPER_0("add_bf16x3_dot", createTritonGPUBF16x3Dot);
   ADD_PASS_WRAPPER_0("add_f32_dot_tc", createTritonGPUF32DotTC);
   ADD_PASS_OPTION_WRAPPER_1("add_optimize_dot_operands",
                             createTritonGPUOptimizeDotOperands, bool);
```
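The wrapper makes the pass reachable from Python pipelines as `passes.ttgpuir.add_bf16x3_dot`. A sketch of how a backend stage might schedule it, assuming the usual pass-manager setup; the placement is illustrative and not prescribed by this commit:

```python
from triton._C.libtriton import ir, passes

def run_bf16x3_decomposition(mod):
    # mod is a TTGIR-level MLIR module; decompose fp32 dots marked bf16x3
    # into BF16 dots before further GPU-specific lowering.
    pm = ir.pass_manager(mod.context)
    pm.enable_debug()
    passes.ttgpuir.add_bf16x3_dot(pm)
    pm.run(mod)
    return mod
```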

python/test/unit/language/test_core.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -3761,7 +3761,7 @@ def get_test_dot_base_cases():
     return [(*shape, 4, False, False, epilogue, input_precision, in_dtype, out_dtype, 1, None)
             for shape in [(64, 64, 64), (32, 32, 32), (16, 16, 16)]
             for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
-            for input_precision in ['tf32', 'tf32x3', 'ieee']
+            for input_precision in ['tf32', 'tf32x3', 'ieee', 'bf16', 'bf16x3']
             for in_dtype, out_dtype in [('float16', 'float16'), ('float16',
                                          'float32'), ('float32',
                                          'float32'), ('float64', 'float64')]
@@ -3915,7 +3915,8 @@ def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dty
         pytest.skip(f"{in_dtype} only supported on CDNA4 and gfx12")
     if in_dtype in ("float8e5b16", "float8e4b8") and not is_hip_cdna3():
         pytest.skip(f"{in_dtype} only supported on CDNA3")
-    if not ((input_precision == "ieee") or (input_precision == "tf32" and is_hip_cdna3())):
+    if not ((input_precision == "bf16x3") or (input_precision == "ieee") or
+            (input_precision == "tf32" and is_hip_cdna3())):
         pytest.skip(f"{input_precision} not supported on HIP")
     if kpack == 2 and in_dtype == 'int8' and K < 64:
         pytest.skip("kpack too large for K")
```

python/triton/language/semantic.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1465,6 +1465,8 @@ def _str_to_dot_input_precision(self, input_precision):
         input_precision = input_precision.upper()
         if input_precision == "TF32X3":
             input_precision = "TF32x3"
+        if input_precision == "BF16X3":
+            input_precision = "BF16x3"
         return getattr(ir.INPUT_PRECISION, input_precision)
 
     def dot(self, lhs: TensorTy, rhs: TensorTy, acc: TensorTy, input_precision: Optional[str],
```
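From the language side, the only user-visible change is that `tl.dot` now accepts the new precision string. A minimal, hypothetical kernel sketch (shapes, pointers, and the single-block launch are illustrative; note this commit only whitelists 'bf16x3' for the AMD backend):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def bf16x3_dot_kernel(a_ptr, b_ptr, c_ptr,
                      M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
    offs_m = tl.arange(0, M)
    offs_n = tl.arange(0, N)
    offs_k = tl.arange(0, K)
    a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * N + offs_n[None, :])
    c = tl.dot(a, b, input_precision="bf16x3")  # fp32 dot, emulated via BF16x3
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], c)

# "cuda" is also the device string for HIP GPUs in ROCm builds of PyTorch.
a = torch.rand(16, 16, device="cuda", dtype=torch.float32)
b = torch.rand(16, 16, device="cuda", dtype=torch.float32)
c = torch.empty(16, 16, device="cuda", dtype=torch.float32)
bf16x3_dot_kernel[(1,)](a, b, c, M=16, N=16, K=16)
print((c - a @ b).abs().max().item())
```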

test/TritonGPU/bf16x3-matmul.mlir

Lines changed: 39 additions & 0 deletions
```mlir
// RUN: triton-opt %s -tritongpu-BF16x3Dot -canonicalize | FileCheck %s --check-prefixes=CHECK

// CHECK: %[[lhs_hi:.*]] = arith.truncf %arg0
// CHECK-NEXT: %[[val1:.*]] = arith.extf %[[lhs_hi]]
// CHECK-NEXT: %[[val2:.*]] = arith.subf %arg0, %[[val1]]
// CHECK-NEXT: %[[lhs_mid:.*]] = arith.truncf %[[val2]]
// CHECK-NEXT: %[[val4:.*]] = arith.extf %[[lhs_mid]]
// CHECK-NEXT: %[[val5:.*]] = arith.subf %[[val2]], %[[val4]]
// CHECK-NEXT: %[[lhs_lo:.*]] = arith.truncf %[[val5]]

// CHECK: %[[rhs_hi:.*]] = arith.truncf %arg1
// CHECK-NEXT: %[[val8:.*]] = arith.extf %[[rhs_hi]]
// CHECK-NEXT: %[[val9:.*]] = arith.subf %arg1, %[[val8]]
// CHECK-NEXT: %[[rhs_mid:.*]] = arith.truncf %[[val9]]
// CHECK-NEXT: %[[val11:.*]] = arith.extf %[[rhs_mid]]
// CHECK-NEXT: %[[val12:.*]] = arith.subf %[[val9]], %[[val11]]
// CHECK-NEXT: %[[rhs_lo:.*]] = arith.truncf %[[val12]]

// CHECK: %[[val14:.*]] = tt.dot %[[lhs_lo]], %[[rhs_lo]]
// CHECK-NEXT: %[[val15:.*]] = tt.dot %[[lhs_mid]], %[[rhs_lo]], %[[val14]], inputPrecision = bf16
// CHECK-NEXT: %[[val16:.*]] = tt.dot %[[lhs_lo]], %[[rhs_mid]], %[[val15]], inputPrecision = bf16
// CHECK-NEXT: %[[val17:.*]] = tt.dot %[[lhs_mid]], %[[rhs_mid]], %[[val16]], inputPrecision = bf16
// CHECK-NEXT: %[[val18:.*]] = tt.dot %[[lhs_lo]], %[[rhs_hi]], %[[val17]], inputPrecision = bf16
// CHECK-NEXT: %[[val19:.*]] = tt.dot %[[lhs_hi]], %[[rhs_lo]], %[[val18]], inputPrecision = bf16
// CHECK-NEXT: %[[val20:.*]] = tt.dot %[[lhs_mid]], %[[rhs_hi]], %[[val19]], inputPrecision = bf16
// CHECK-NEXT: %[[val21:.*]] = tt.dot %[[lhs_hi]], %[[rhs_mid]], %[[val20]], inputPrecision = bf16

// CHECK: %[[val22:.*]] = arith.cmpf uno, %[[val21]], %[[val21]]
// CHECK-NEXT: %[[val23:.*]] = arith.select %[[val22]]

// CHECK: %[[val24:.*]] = tt.dot %[[lhs_hi]], %[[rhs_hi]], %[[val23]], inputPrecision = bf16
// CHECK-NEXT: %[[val25:.*]] = arith.addf %[[val24]], %arg2

module {
  tt.func @dot_test(%arg0: tensor<16x16xf32>, %arg1: tensor<16x16xf32>, %arg2: tensor<16x16xf32>) -> tensor<16x16xf32> {
    %4 = tt.dot %arg0, %arg1, %arg2, inputPrecision = bf16x3 : tensor<16x16xf32> * tensor<16x16xf32> -> tensor<16x16xf32>
    tt.return %4 : tensor<16x16xf32>
  }
}
```

third_party/amd/backend/compiler.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -44,7 +44,7 @@ class HIPOptions:
     supported_fp8_dtypes: Tuple[str] = ("fp8e4nv", "fp8e5", "fp8e5b16", "fp8e4b8")
     deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
     default_dot_input_precision: str = "ieee"
-    allowed_dot_input_precisions: Tuple[str] = ("ieee", )
+    allowed_dot_input_precisions: Tuple[str] = ("ieee", 'bf16x3')
     enable_fp_fusion: bool = True
     launch_cooperative_grid: bool = False
     matrix_instr_nonkdim: int = 0
```
