[AMD] Add Warp-Pipeline Support, Gluon and LLVM lowering. #8586
Open
jungpark-mlir wants to merge 44 commits into triton-lang:main from jungpark-mlir:newpp
+806
−4
Changes from all commits (44 commits):
ce47bac PoC impl of warp pipeline part #1
b97a7ac Merge branch 'triton-lang:main' into newpp
da9f34e Add lowering warp pipeline
8999aba Merge branch 'triton-lang:main' into newpp
c81dd97 Merge branch 'triton-lang:main' into newpp
e370a53 update
3ab6436 milestone 1
12a521b Merge branch 'triton-lang:main' into newpp
84d44be actually insert fence/barrier
424c0e4 backup
8d9e559 Merge branch 'triton-lang:main' into newpp
596f46c Refactorize code
7bedeab Fix inserting barrier
58051cb Add support for gluon
50f41eb Merge branch 'newpp2' into newpp
2c695de Merge pull request #5 from jungpark-mlir/newpp
a17b120 Fix compilation
099bfe8 Merge branch 'triton-lang:main' into newpp2
7f1ddf0 Merge branch 'newpp' into newpp2
4697a26 Merge pull request #6 from jungpark-mlir/newpp2
9fafe4b Merge branch 'newpp3' into newpp
ac3c75d Merge pull request #7 from jungpark-mlir/newpp
acfd61c improve code
0d7fcd2 Merge branch 'triton-lang:main' into newpp3
89ca6c4 Merge branch 'newpp2' into newpp3
f298346 Merge pull request #8 from jungpark-mlir/newpp3
4e69362 Add test and last clean up
012678f Merge branch 'triton-lang:main' into newpp2
2715698 Fix leftovers
ef3f875 revert whitespace removal
fde1c36 Merge branch 'newpp' into newpp2
75154e0 Merge pull request #9 from jungpark-mlir/newpp2
90a2a7f Improve implementation per review
c5315f4 Merge branch 'triton-lang:main' into newpp2
c1087af Merge pull request #10 from jungpark-mlir/newpp2
3e45471 Remove extra canonicalization.
975a15c Merge branch 'main' into newpp
f884827 Fix accidental mistype.
885ee54 Remove unused option.
f90e809 Change to use `with` to define pipeline.
85f4b21 Merge branch 'triton-lang:main' into newpp
b3fa756 Merge branch 'triton-lang:main' into newpp
6bb284e Fix test
96a1e28 Format
```diff
@@ -1,6 +1,13 @@
 from .._core import builtin
 from ._layouts import AMDMFMALayout, AMDWMMALayout
 from . import cdna3, cdna4
 from . import rdna3, rdna4
 from . import gfx1250
+from .warp_pipeline import warp_pipeline_stage

-__all__ = ["AMDMFMALayout", "AMDWMMALayout", "cdna3", "cdna4", "rdna3", "rdna4", "gfx1250"]
+__all__ = ["AMDMFMALayout", "AMDWMMALayout", "cdna3", "cdna4", "rdna3", "rdna4", "gfx1250", "warp_pipeline_stage"]
+
+
+@builtin
+def split_warp_pipeline(_semantic=None):
+    return _semantic.builder.create_warp_pipeline_border()
```
python/triton/experimental/gluon/language/amd/warp_pipeline.py (new file, 26 additions):
```python
from __future__ import annotations
from typing import Optional


class warp_pipeline_stage:
    __slots__ = ("label", "_semantic")

    def __init__(self, label: Optional[str] = None, **_internal_kwargs):
        self.label = label
        self._semantic = _internal_kwargs.pop("_semantic", None)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        if exc_type is not None:
            return False
        try:
            from . import split_warp_pipeline
            try:
                split_warp_pipeline(_semantic=self._semantic)
            except TypeError:
                split_warp_pipeline()
        except Exception:
            pass
        return False
```
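Taken standalone, the context-manager protocol above can be exercised without the Gluon runtime. The sketch below substitutes a stub builder for the real semantic builder (`FakeBuilder` and `FakeSemantic` are illustrative names, not part of the PR) to show when a pipeline border gets emitted:

```python
# Minimal standalone sketch of the warp_pipeline_stage protocol.
# FakeBuilder stands in for the real Gluon semantic builder; it only
# counts how many pipeline borders were requested.
from typing import Optional


class FakeBuilder:
    def __init__(self):
        self.borders = 0

    def create_warp_pipeline_border(self):
        self.borders += 1


class FakeSemantic:
    def __init__(self):
        self.builder = FakeBuilder()


class warp_pipeline_stage:
    __slots__ = ("label", "_semantic")

    def __init__(self, label: Optional[str] = None, **_internal_kwargs):
        self.label = label
        self._semantic = _internal_kwargs.pop("_semantic", None)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Emit a border only when the stage body exits cleanly.
        if exc_type is None and self._semantic is not None:
            self._semantic.builder.create_warp_pipeline_border()
        return False  # never swallow exceptions


sem = FakeSemantic()
with warp_pipeline_stage("load", _semantic=sem):
    pass  # stage 0 body
with warp_pipeline_stage("compute", _semantic=sem):
    pass  # stage 1 body
print(sem.builder.borders)  # one border per completed stage
```

Note that on an exception the real `__exit__` also returns `False`, so errors inside a stage propagate instead of silently emitting a border.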
```mlir
// RUN: triton-opt %s -convert-warp-pipeline | FileCheck %s

// ---- 2-stage pipeline (basic) ----

tt.func @two_stage_backend(%n: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index

  // Frontend has already annotated total stages.
  scf.for %i = %c0 to %n step %c1 {
    // Stage 0 cluster
    scf.execute_region {
      %a0 = arith.addi %i, %c1 : index
      %x0 = arith.addi %a0, %c1 : index
      scf.yield
    } {triton.warp_pipeline.stage}

    // Stage 1 cluster
    scf.execute_region {
      %a1 = arith.addi %i, %c1 : index
      %x1 = arith.muli %a1, %c1 : index
      scf.yield
    } {triton.warp_pipeline.stage}

    scf.yield
  } {triton.warp_pipeline.pipelined_for}

  tt.return
}

// CHECK-LABEL: tt.func @two_stage_backend(
// CHECK: %c0 = arith.constant 0 : index
// CHECK: %c1 = arith.constant 1 : index
// CHECK-NOT: no_inline

// === Pre-loop sync + role setup ===
// CHECK: gpu.barrier
// CHECK: arith.divsi
// CHECK: %[[WARPLOW:.+]] = arith.cmpi eq
// CHECK: %[[WARPHIGH:.+]] = arith.cmpi ne
// CHECK: amdg.cond_barrier %[[WARPHIGH]]

// CHECK: scf.for
// CHECK-NOT: scf.execute_region
// CHECK: rocdl.sched.barrier
// CHECK: rocdl.s.barrier
// CHECK: rocdl.sched.barrier
// CHECK-NOT: scf.execute_region

// CHECK: amdg.cond_barrier %[[WARPLOW]]
// CHECK: tt.return

// ---- 3-stage pipeline (ensures multiple clusters handled) ----

tt.func @three_stage_backend(%n: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index

  scf.for %i = %c0 to %n step %c1 {
    // Stage 0
    scf.execute_region {
      %x0 = arith.addi %i, %c1 : index
      scf.yield
    } {triton.warp_pipeline.stage}
    // Stage 1
    scf.execute_region {
      %x1 = arith.muli %i, %c1 : index
      scf.yield
    } {triton.warp_pipeline.stage}
    // Stage 2
    scf.execute_region {
      %x2 = arith.addi %i, %c1 : index
      scf.yield
    } {triton.warp_pipeline.stage}

    scf.yield
  } {triton.warp_pipeline.pipelined_for}

  tt.return
}

// CHECK-LABEL: tt.func @three_stage_backend(
// CHECK-NOT: no_inline
// CHECK: gpu.barrier
// CHECK: amdg.cond_barrier
// CHECK: scf.for
// CHECK-NOT: scf.execute_region
// CHECK: rocdl.sched.barrier
// CHECK: rocdl.s.barrier
// CHECK: rocdl.sched.barrier
// CHECK: amdg.cond_barrier
// CHECK: tt.return

// ---- Negative: no total_stages → pass should not touch the loop ----

tt.func @no_total_stages(%n: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  scf.for %i = %c0 to %n step %c1 {
    scf.execute_region {
      %x = arith.addi %i, %c1 : index
      scf.yield
    }
    scf.yield
  }
  tt.return
}

// CHECK-LABEL: tt.func @no_total_stages(
// CHECK-NOT: gpu.barrier
// CHECK-NOT: amdg.cond_barrier
// CHECK: scf.for
// CHECK: scf.execute_region
// CHECK: tt.return
```
```mlir
// RUN: triton-opt %s -tritonamdgpu-warp-pipeline | FileCheck %s

// ---- 3-stage example (two borders) ----

tt.func @three_stage_example(%n: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index

  scf.for %i = %c0 to %n step %c1 {
    // Stage 0 (before first border)
    %a = arith.addi %i, %c1 : index
    %a2 = arith.muli %a, %c1 : index

    // explicit split point → next stage begins
    rocdl.sched.barrier 0 {triton.warp_pipeline.border}

    // Stage 1
    %b = arith.addi %a2, %i : index

    // explicit split point → next stage begins
    rocdl.sched.barrier 0 {triton.warp_pipeline.border}

    // Stage 2
    %c = arith.addi %b, %a : index
    %d = arith.muli %c, %c1 : index

    scf.yield
  }

  tt.return
}

// CHECK-LABEL: tt.func @three_stage_example(
// CHECK: scf.for
//
// Inside the loop we expect exactly three execute_region clusters:
// CHECK: scf.execute_region
// CHECK: scf.execute_region
// CHECK: scf.execute_region
// CHECK: triton.warp_pipeline.pipelined_for
//
// And the split markers must be gone:
// CHECK-NOT: rocdl.sched.barrier
// CHECK: tt.return

// ---- 2-stage example (one border) ----

tt.func @two_stage_example(%n: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index

  scf.for %i = %c0 to %n step %c1 {
    // Stage 0
    %x = arith.addi %i, %c1 : index

    // split to Stage 1
    rocdl.sched.barrier 0 {triton.warp_pipeline.border}

    // Stage 1
    %y = arith.muli %x, %c1 : index

    scf.yield
  }

  tt.return
}

// CHECK-LABEL: tt.func @two_stage_example(
// CHECK: scf.for
// CHECK: scf.execute_region
// CHECK: scf.execute_region
// CHECK: triton.warp_pipeline.pipelined_for
// CHECK-NOT: rocdl.sched.barrier
// CHECK: tt.return

// ---- Negative: no border → no structuring ----

tt.func @no_split_example(%n: index) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index

  scf.for %i = %c0 to %n step %c1 {
    %x = arith.addi %i, %c1 : index
    %y = arith.muli %x, %c1 : index
    scf.yield
  }

  tt.return
}

// CHECK-LABEL: tt.func @no_split_example(
// CHECK: scf.for
// CHECK-NOT: scf.execute_region
// CHECK-NOT: pipelined_for
// CHECK: tt.return
```
I don't think this makes sense as a concept in gluon. How will tensors work if they are executed on only one warp, but the tensor layout is expecting multiple warps?
Would the `gl.warp_specialize` API work instead?
Warp-pipelining does not restrict execution to a single warp, nor does it change the program’s semantics. With or without warp-pipelining, all warps execute the same code and produce the same results. Warp-pipelining simply shifts the timing of when different warps execute different stages of a loop, allowing them to overlap memory and compute phases more effectively. In other words, it controls when warps run each stage, not what they execute.
Historically, we achieved the same effect with the block-pingpong pass, which flattened the IR, manually partitioned the loop, and inserted all synchronization in place. That approach worked but did not scale: every new kernel or architecture required manual scheduling, barrier placement, and tuning. warp-pipeline replaces that ad-hoc strategy with a structured IR representation, enabling systematic pipeline analysis and automation.
Warp-pipelining is fundamentally different from warp-specialization. We're also reviewing support for `warp-specialize`, but that's a separate effort.
I see, so IIUC, this is all about emitting the `cond_barrier` to delay some warps at runtime? Won't you just re-converge as soon as there is a barrier inside the loop? Also, what difference do the stages make in that case?
That's a good point. The key idea behind using `cond_barrier` is that warp groups diverge in time once they meet a `cond_barrier`, but they don't need to reconverge at the same program counter. Once one group is delayed and the other runs ahead, they continue to "leap-frog" each other by hitting different barrier sites in the loop. This works because the HW releases a barrier when all participating threads have reached a barrier, but not necessarily the same barrier instruction. In other words, barriers are satisfied once every warp has arrived at some barrier, even if those barriers are at different PCs. This allows two (or more) warp groups to keep synchronizing without ever reconverging, which naturally maintains pipelined execution across iterations. At the end of the loop, we place a countering `cond_barrier` to reconverge the warps.

When I first explored this idea, I also assumed that barriers would only release when all threads reached the exact same barrier instruction. So I initially forced reconvergence in HW using a wrapper `llvm.func` containing a barrier, ensuring warps could conditionally funnel to one canonical barrier point from different points in the kernel. That version also worked, but it turned out to be unnecessary given the actual HW behavior, and the `cond_barrier` implementation has been simplified to the current one.
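The leap-frog timing described here can be sketched as a toy schedule in plain Python. Everything below is illustrative (it models the schedule only, not the hardware): two warp groups run a 2-stage loop, the "high" group is delayed one stage by the pre-loop `cond_barrier`, and each barrier step pairs the groups at different stages:

```python
# Toy model of two warp groups leap-frogging through a 2-stage loop.
# A barrier releases once every group has arrived at a barrier site,
# even when those sites are at different program counters.
N_ITERS = 4

# Each group's timeline is a list of (iteration, stage) steps.
low = [(i, s) for i in range(N_ITERS) for s in (0, 1)]
# The pre-loop cond_barrier delays the "high" group by one stage.
high = [None] + low[:-1]

# Pair up what each group executes between consecutive barriers.
schedule = list(zip(low, high))

# In steady state the groups always occupy different stages, so the
# memory phase (stage 0) and compute phase (stage 1) overlap across groups.
steady = [pair for pair in schedule if pair[1] is not None]
for (li, ls), (hi, hs) in steady:
    assert ls != hs  # the two groups never run the same stage concurrently
print(len(steady))
```

The final countering `cond_barrier` after the loop corresponds to letting the delayed group catch up on its last pending stage.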
So does that mean that, for example, a reduction or layout conversion inside the loop will fail, because the barriers emitted during lowering will not behave as expected?

Also, you didn't answer why there need to be different stages. Could you get the same result by just exposing `cond_barrier` directly in gluon?
If the question is "why not just expose cond_barrier and related ops directly," that's essentially what the `block-pingpong` pass does today, and `warp-pipeline` is the structured improvement of that approach. Two key improvements:
- A directly exposed `cond_barrier` could behave differently for each warp, which violates the block-based TTG programming model; `warp-pipeline` keeps the divergence structured at the stage level.
- Across new kernels and hardware, `block-pingpong` repeatedly showed that dependency analysis and stage partitioning are both performance-critical and a common source of bugs; a structured IR makes them amenable to systematic analysis.

I think this is a totally fair question. Since this PR only contains the Gluon support, this may look like extra structure. The motivation, though, is to support auto-partitioning in the future and ensure a consistent IR design for both manual and automated pipelines.
And you're right, `warp-pipeline` cannot simply work for ops like reductions or layout conversions that inherently require warp-wide synchronization. It can only work if the sync points align with stage boundaries. In those cases, you structure the pipeline so the required barrier happens at the end of a stage. If the required synchronization cannot align with a stage boundary, then warp-pipeline simply isn't appropriate for that loop. Users (and eventually auto-partitioning) decide when this optimization works effectively.
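The legality condition stated here, that a warp-wide sync point must coincide with a stage boundary, can be expressed as a small check over a stage partition. The sketch below is illustrative only (the op names and the `needs_barrier` flag are hypothetical stand-ins for the real dependency analysis):

```python
# Toy legality check: an op that needs a warp-wide barrier is only
# allowed as the last op of its stage, where the pipeline already
# synchronizes the warp groups.
def pipeline_is_legal(stages):
    """stages: list of stages; each stage is a list of (op_name, needs_barrier)."""
    for ops in stages:
        for idx, (name, needs_barrier) in enumerate(ops):
            if needs_barrier and idx != len(ops) - 1:
                return False  # sync point not aligned with a stage boundary
    return True


ok = pipeline_is_legal([
    [("load_a", False), ("load_b", False)],
    [("mma", False), ("reduce", True)],   # barrier at stage end: fine
])
bad = pipeline_is_legal([
    [("reduce", True), ("mma", False)],   # barrier mid-stage: illegal
])
print(ok, bad)
```

A real implementation would derive `needs_barrier` from the op's lowering (e.g. reductions and layout conversions through shared memory) rather than from a hand-written flag.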
Okay, I think I misunderstood something you said earlier. I was thinking `cond_barrier` was only participated in by the conditioned threads, but IIUC, actually all threads participate in the barrier, just at different points in the program. So for the first barrier, the upper threads are blocked while the lower execute the first stage. Then the lower half waits at the end of the second stage for the upper half to finish the first stage, and so on. I can see why that would be problematic.

To be transparent, though, I'm not sure I like the direction of having functions that are really annotations that change what's happening in the surrounding context. I'm fine with having new language constructs in general, I just want to make sure they fit in with a reasonable programming model. I especially don't like that the legality of using a function would depend on what other functions are called in the same loop...

Would you be open to implementing a different syntax? Something like:

IMO this makes it clearer that the operations are happening within a warp-pipelined context.

I also think you should raise an error if any of the ops require a barrier, as silent correctness issues that depend on implementation details of an op's lowering don't sound like a great user experience.
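The reviewer's code snippet did not survive the page scrape, but the commit "Change to use `with` to define pipeline" suggests the direction. A plausible shape for such a with-based syntax, sketched with a toy recorder (the `@contextmanager` stub and the `trace` list are hypothetical stand-ins, not the PR's implementation, which would emit IR region boundaries instead):

```python
# Hypothetical sketch of a with-based warp-pipeline syntax.
# "trace" only records which stage bodies ran; a real implementation
# would emit pipeline-stage boundaries into the IR instead.
from contextlib import contextmanager

trace = []


@contextmanager
def warp_pipeline_stage(label):
    trace.append(("enter", label))
    yield
    trace.append(("exit", label))


# Each stage is a lexical block, so the warp-pipelined context is
# explicit and the stage boundaries are visible in the source.
def kernel_loop(n):
    for i in range(n):
        with warp_pipeline_stage("load"):
            pass  # e.g. global -> shared memory copies
        with warp_pipeline_stage("compute"):
            pass  # e.g. mma on the previously loaded tile


kernel_loop(1)
print(trace)
```

Compared with a free-standing split call, the block form makes it syntactically impossible to open a stage without closing it.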
That sounds like a great idea to explore. Let me try.

Sure, that's definitely in the plan. It's not always impossible to use `warp-pipeline` with those algorithms; it depends on the dependency between the operation and its users, e.g., it's fine if the synchronization can be deferred to the cluster boundary. The current analysis only looks for dependencies that require a local memory fence, but a check for illegal dependencies will be added.