44 commits
ce47bac
PoC impl of warp pipeline part #1
jungpark-mlir Aug 22, 2025
b97a7ac
Merge branch 'triton-lang:main' into newpp
jungpark-mlir Aug 26, 2025
da9f34e
Add lowering warp pipeline
jungpark-mlir Sep 5, 2025
8999aba
Merge branch 'triton-lang:main' into newpp
jungpark-mlir Sep 5, 2025
c81dd97
Merge branch 'triton-lang:main' into newpp
jungpark-mlir Sep 20, 2025
e370a53
update
jungpark-mlir Sep 25, 2025
3ab6436
milestone 1
jungpark-mlir Sep 26, 2025
12a521b
Merge branch 'triton-lang:main' into newpp
jungpark-mlir Sep 30, 2025
84d44be
actually insert fence/barrier
jungpark-mlir Sep 30, 2025
424c0e4
backup
jungpark-mlir Oct 1, 2025
8d9e559
Merge branch 'triton-lang:main' into newpp
jungpark-mlir Oct 1, 2025
596f46c
Refactorize code
jungpark-mlir Oct 1, 2025
7bedeab
Fix inserting barrier
jungpark-mlir Oct 2, 2025
58051cb
Add support for gluon
jungpark-mlir Oct 13, 2025
50f41eb
Merge branch 'newpp2' into newpp
jungpark-mlir Oct 13, 2025
2c695de
Merge pull request #5 from jungpark-mlir/newpp
jungpark-mlir Oct 13, 2025
a17b120
Fix compilation
jungpark-mlir Oct 15, 2025
099bfe8
Merge branch 'triton-lang:main' into newpp2
jungpark-mlir Oct 15, 2025
7f1ddf0
Merge branch 'newpp' into newpp2
jungpark-mlir Oct 16, 2025
4697a26
Merge pull request #6 from jungpark-mlir/newpp2
jungpark-mlir Oct 16, 2025
9fafe4b
Merge branch 'newpp3' into newpp
jungpark-mlir Oct 21, 2025
ac3c75d
Merge pull request #7 from jungpark-mlir/newpp
jungpark-mlir Oct 21, 2025
acfd61c
improve code
jungpark-mlir Oct 21, 2025
0d7fcd2
Merge branch 'triton-lang:main' into newpp3
jungpark-mlir Oct 24, 2025
89ca6c4
Merge branch 'newpp2' into newpp3
jungpark-mlir Oct 27, 2025
f298346
Merge pull request #8 from jungpark-mlir/newpp3
jungpark-mlir Oct 27, 2025
4e69362
Add test and last clean up
jungpark-mlir Oct 29, 2025
012678f
Merge branch 'triton-lang:main' into newpp2
jungpark-mlir Oct 29, 2025
2715698
Fix leftovers
jungpark-mlir Oct 29, 2025
ef3f875
revert whitespace removal
jungpark-mlir Oct 29, 2025
fde1c36
Merge branch 'newpp' into newpp2
jungpark-mlir Oct 29, 2025
75154e0
Merge pull request #9 from jungpark-mlir/newpp2
jungpark-mlir Oct 29, 2025
90a2a7f
Improve implementation per review
jungpark-mlir Oct 31, 2025
c5315f4
Merge branch 'triton-lang:main' into newpp2
jungpark-mlir Oct 31, 2025
c1087af
Merge pull request #10 from jungpark-mlir/newpp2
jungpark-mlir Oct 31, 2025
3e45471
Remove extra canonicalization.
jungpark-mlir Oct 31, 2025
975a15c
Merge branch 'main' into newpp
jungpark-mlir Nov 3, 2025
f884827
Fix accidental mistype.
jungpark-mlir Nov 3, 2025
885ee54
Remove unused option.
jungpark-mlir Nov 3, 2025
f90e809
Change to use `with` to define pipeline.
jungpark-mlir Nov 10, 2025
85f4b21
Merge branch 'triton-lang:main' into newpp
jungpark-mlir Nov 10, 2025
b3fa756
Merge branch 'triton-lang:main' into newpp
jungpark-mlir Nov 11, 2025
6bb284e
Fix test
jungpark-mlir Nov 11, 2025
96a1e28
Format
jungpark-mlir Nov 11, 2025
2 changes: 2 additions & 0 deletions bin/RegisterTritonDialects.h
@@ -95,6 +95,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::triton::registerConvertTritonAMDGPUToLLVM();
mlir::triton::registerConvertBuiltinFuncToLLVM();
mlir::triton::registerOptimizeAMDLDSUsage();
mlir::triton::registerConvertWarpPipeline();

mlir::ub::registerConvertUBToLLVMInterface(registry);
mlir::registerConvertNVVMToLLVMInterface(registry);
@@ -115,6 +116,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::registerTritonAMDGPUInThreadTranspose();
mlir::registerTritonAMDGPUCoalesceAsyncCopy();
mlir::registerTritonAMDGPUUpdateAsyncWaitCount();
mlir::registerTritonAMDGPUWarpPipeline();
mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();
mlir::registerTritonAMDFoldTrueCmpI();
14 changes: 11 additions & 3 deletions python/src/gluon_ir.cc
@@ -5,6 +5,7 @@
#include <optional>
#include <stdexcept>

#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/Types.h"
@@ -830,10 +831,17 @@ void init_gluon_ir(py::module &&m) {
self.create<ttag::AsyncTDMCopyLocalToGlobalOp>(descPtr, indices,
src);
})
.def("create_async_tdm_wait", [](GluonOpBuilder &self, int num) {
ValueRange tokens;
self.create<ttag::AsyncTDMWait>(tokens, num);
.def("create_async_tdm_wait",
[](GluonOpBuilder &self, int num) {
ValueRange tokens;
self.create<ttag::AsyncTDMWait>(tokens, num);
})
.def("create_warp_pipeline_border", [](GluonOpBuilder &self) {
auto border = self.create<ROCDL::SchedBarrier>(0);
border->setAttr("triton.warp_pipeline.border",
self.getBuilder().getUnitAttr());
});
;

m.def(
"compute_tmem_reg_layout",
21 changes: 21 additions & 0 deletions python/test/gluon/test_frontend.py
@@ -3000,6 +3000,27 @@ def kernel(bf16_ptr):
""")


@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4, HIP_TARGET_GFX1250])
def test_amd_warp_pipeline(target):

    @gluon.jit
    def kernel():
        c0: ttgl.constexpr = 0
        one: ttgl.constexpr = 1

        # Simple loop with an explicit split point
        for i in range(c0, 10, one):
            with ttgl.amd.warp_pipeline_stage("stage0"):
                x = i + one
            with ttgl.amd.warp_pipeline_stage("stage1"):
                y = x * one
                x = y + one

    module = run_parser(kernel, *make_args(num_warps=4), target=target)
    print(module)
    assert str(module).count("triton.warp_pipeline.border") == 2


@gluon.jit
def print_num_warps():
num_warps: ttgl.constexpr = ttgl.num_warps()
9 changes: 8 additions & 1 deletion python/triton/experimental/gluon/language/amd/__init__.py
@@ -1,6 +1,13 @@
from .._core import builtin
from ._layouts import AMDMFMALayout, AMDWMMALayout
from . import cdna3, cdna4
from . import rdna3, rdna4
from . import gfx1250
from .warp_pipeline import warp_pipeline_stage

__all__ = ["AMDMFMALayout", "AMDWMMALayout", "cdna3", "cdna4", "rdna3", "rdna4", "gfx1250"]
__all__ = ["AMDMFMALayout", "AMDWMMALayout", "cdna3", "cdna4", "rdna3", "rdna4", "gfx1250", "warp_pipeline_stage"]


@builtin
def split_warp_pipeline(_semantic=None):
    return _semantic.builder.create_warp_pipeline_border()
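
For context (not part of the diff): a minimal sketch of how this builtin marks a stage border when called directly inside a Gluon kernel. It emits the rocdl.sched.barrier tagged with triton.warp_pipeline.border that the later passes consume; the with-based warp_pipeline_stage wrapper exercised in the test above is the higher-level form of the same mechanism.

@gluon.jit
def kernel():
    one: ttgl.constexpr = 1
    for i in range(0, 10, one):
        x = i + one                      # stage 0
        ttgl.amd.split_warp_pipeline()   # explicit stage border
        y = x * one                      # stage 1
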
Contributor:
I don't think this makes sense as a concept in gluon. How will tensors work if they are executed on only one warp, but the tensor layout is expecting multiple warps?

  1. Would the gl.warp_specialize API work instead?
  2. If not, could we generalize the API in a way that works also for AMD?

Contributor Author (jungpark-mlir):
Warp-pipelining does not restrict execution to a single warp, nor does it change the program’s semantics. With or without warp-pipelining, all warps execute the same code and produce the same results. Warp-pipelining simply shifts the timing of when different warps execute different stages of a loop, allowing them to overlap memory and compute phases more effectively. In other words, it controls when warps run each stage, not what they execute.

Historically, we achieved the same effect using the block-pingpong pass, which operated on flattened IR, manually partitioned the loop, and inserted all synchronization in place. That approach worked but did not scale: every new kernel or architecture required manual scheduling, barrier placement, and tuning. Warp-pipeline replaces that ad-hoc strategy with a structured IR representation, enabling systematic pipeline analysis and automation.

Warp-pipelining is fundamentally different from warp-specialization:

  • Warp-pipelining: all warps execute the same code; no functional divergence; only timing differs.
  • Warp-specialization: different warps run different roles or code paths (e.g., loader warp vs. compute warp), and there is no notion of pipeline stage ordering.

We're also reviewing support for warp-specialize, but that's a separate effort.

Contributor:
I see, so IIUC, this is all about emitting the cond_barrier to delay some warps at runtime? Won't you just re-converge as soon as there is a barrier inside the loop? Also what difference do the stages make in that case?

Contributor Author (jungpark-mlir):
That's a good point. The key idea behind using cond_barrier is that warp groups diverge in time once they meet a cond_barrier, but they don't need to reconverge at the same program counter. Once one group is delayed and the other runs ahead, they continue to "leap-frog" each other by hitting different barrier sites in the loop.
This is because the HW releases a barrier when all participating threads have reached a barrier, but not necessarily the same barrier instruction. In other words, a barrier is satisfied once every warp has arrived at its incoming barrier, even if those barriers are at different PCs. This allows two (or more) warp groups to keep synchronizing without ever reconverging, which naturally maintains the pipelined execution across iterations. At the end of the loop, we place a counter cond_barrier to reconverge the warps.

When I first explored this idea, I also assumed that barriers would only release when all threads reached the exact same barrier instruction. So I initially forced reconvergence in HW using a wrapper llvm.func containing a barrier, ensuring warps could conditionally funnel to one canonical barrier point from different points in the kernel. That version also worked, but it turned out to be unnecessary given current HW behavior, and the cond_barrier implementation has been simplified to its current form.
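
To make the leap-frog behaviour concrete, here is a toy model using plain Python threads (an illustration only, not Triton or GPU code; the two-warp-group split and loop shape are assumptions). threading.Barrier plays the role of the workgroup barrier, which releases once every participant has arrived at some barrier call, not necessarily the same one.

import threading

NUM_ITERS = 3
barrier = threading.Barrier(2)  # two warp groups per workgroup (illustrative)
log = []

def warp_group(name, delayed):
    if delayed:
        barrier.wait()          # cond_barrier: this group starts one stage late
    for i in range(NUM_ITERS):
        log.append(f"{name}: stage0, iter {i}")
        barrier.wait()          # barrier at the stage0/stage1 border
        log.append(f"{name}: stage1, iter {i}")
        barrier.wait()          # barrier at the end of the iteration
    if not delayed:
        barrier.wait()          # counter cond_barrier: reconverge after the loop

threads = [threading.Thread(target=warp_group, args=("low warps", False)),
           threading.Thread(target=warp_group, args=("high warps", True))]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("\n".join(log))  # stage0 of one group overlaps stage1 of the other

Each group calls wait() the same number of times, so the barrier always releases with one group a stage ahead of the other, which is exactly the pipelined overlap described above.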

Contributor:
So does that mean that, for example, a reduction or layout conversion inside the loop will fail because the barriers emitted during the lowering will not behave as expected?

Also you didn't answer why there need to be different stages. Could you get the same result by just exposing cond_barrier directly in gluon?

Contributor Author (jungpark-mlir):
If the question is "why not just expose cond_barrier and related ops directly," that's essentially what the block-pingpong pass does today, and warp-pipeline is a structured improvement of that approach. Two key improvements:

  1. cond_barrier can behave differently for each warp, which violates the block-based TTG programming model if exposed directly.
  2. Automation: currently the pass does the minimum analysis needed to determine dependencies across pipeline stages (which is slightly different from what Membar does), and eventually we'd like to use this IR structure and lowering method for auto-partitioning of the warp pipeline. Across new kernels and hardware, block-pingpong repeatedly showed that dependency analysis and stage partitioning are both performance-critical and a common source of bugs.

I think this is a totally fair question. Since this PR only contains the Gluon support, this may look like extra structure. The motivation, though, is to support auto-partitioning in the future and to ensure a consistent IR design for both manual and automated pipelines.

And you're right, warp-pipeline cannot simply work with ops like reductions or layout conversions that inherently require warp-wide synchronization. It only works if the sync points align with stage boundaries; in those cases, you structure the pipeline so the required barrier falls at the end of a stage. If the required synchronization cannot be aligned with a stage boundary, then warp-pipeline simply isn't appropriate for that loop. Users (and eventually auto-partitioning) decide when this optimization can be applied effectively.

Contributor (@peterbell10), Nov 6, 2025:
Okay, I think I misunderstood something you said earlier. I was thinking that only the conditioned threads participate in cond_barrier, but IIUC all threads actually participate in the barrier, just at different points in the program. So for the first barrier, the upper threads are blocked while the lower ones execute the first stage. Then the lower half waits at the end of the second stage for the upper half to finish the first stage, and so on. I can see why that would be problematic.

To be transparent though, I'm not sure I like the direction of having functions that are really annotations that change what's happening in the surrounding context. I'm fine with having new language constructs in general; I just want to make sure it fits in with a reasonable programming model. I especially don't like that the legality of using a function would depend on what other functions are called in the same loop...

Would you be open to implementing a different syntax? Something like:

for i in range(...):
    with amd.warp_pipeline_stage():
        x = i + one
    with amd.warp_pipeline_stage():
        y = x * one
        x = y + one

IMO this makes it clearer that the operations are happening within a warp-pipelined context.

I also think you should raise an error if any of the ops require a barrier, as silent correctness issues that depend on implementation details of an op's lowering don't make for a great user experience.

Contributor Author (jungpark-mlir):
> Would you be open to implementing a different syntax? Something like:

That sounds like a great idea to explore. Let me try.

> I also think you should raise an error if any of the ops require a barrier

Sure, that's definitely in the plan. Using warp-pipeline with those algorithms is not always impossible; it depends on the dependency between the operation and its users, e.g., it's fine if synchronization can be deferred to the cluster boundary. The current analysis only looks for dependencies that require a local memory fence, but a check for illegal dependencies will be added.

26 changes: 26 additions & 0 deletions python/triton/experimental/gluon/language/amd/warp_pipeline.py
@@ -0,0 +1,26 @@
from __future__ import annotations
from typing import Optional


class warp_pipeline_stage:
    __slots__ = ("label", "_semantic")

    def __init__(self, label: Optional[str] = None, **_internal_kwargs):
        self.label = label
        self._semantic = _internal_kwargs.pop("_semantic", None)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        if exc_type is not None:
            return False
        try:
            from . import split_warp_pipeline
            try:
                split_warp_pipeline(_semantic=self._semantic)
            except TypeError:
                split_warp_pipeline()
        except Exception:
            pass
        return False
118 changes: 118 additions & 0 deletions test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
@@ -0,0 +1,118 @@
// RUN: triton-opt %s -convert-warp-pipeline | FileCheck %s

// ---- 2-stage pipeline (basic) ----

tt.func @two_stage_backend(%n: index) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index

// Frontend has already annotated total stages.
scf.for %i = %c0 to %n step %c1 {

// Stage 0 cluster
scf.execute_region {
%a0 = arith.addi %i, %c1 : index
%x0 = arith.addi %a0, %c1 : index
scf.yield
} {triton.warp_pipeline.stage}

// Stage 1 cluster
scf.execute_region {
%a1 = arith.addi %i, %c1 : index
%x1 = arith.muli %a1, %c1 : index
scf.yield
} {triton.warp_pipeline.stage}

scf.yield
} {triton.warp_pipeline.pipelined_for}

tt.return
}

// CHECK-LABEL: tt.func @two_stage_backend(
// CHECK: %c0 = arith.constant 0 : index
// CHECK: %c1 = arith.constant 1 : index
// CHECK-NOT: no_inline

// === Pre-loop sync + role setup ===
// CHECK: gpu.barrier
// CHECK: arith.divsi
// CHECK: %[[WARPLOW:.+]] = arith.cmpi eq
// CHECK: %[[WARPHIGH:.+]] = arith.cmpi ne
// CHECK: amdg.cond_barrier %[[WARPHIGH]]

// CHECK: scf.for
// CHECK-NOT: scf.execute_region
// CHECK: rocdl.sched.barrier
// CHECK: rocdl.s.barrier
// CHECK: rocdl.sched.barrier
// CHECK-NOT: scf.execute_region

// CHECK: amdg.cond_barrier %[[WARPLOW]]
// CHECK: tt.return


// ---- 3-stage pipeline (ensures multiple clusters handled) ----

tt.func @three_stage_backend(%n: index) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index

scf.for %i = %c0 to %n step %c1 {

// Stage 0
scf.execute_region {
%x0 = arith.addi %i, %c1 : index
scf.yield
} {triton.warp_pipeline.stage}
// Stage 1
scf.execute_region {
%x1 = arith.muli %i, %c1 : index
scf.yield
} {triton.warp_pipeline.stage}
// Stage 2
scf.execute_region {
%x2 = arith.addi %i, %c1 : index
scf.yield
} {triton.warp_pipeline.stage}

scf.yield
} {triton.warp_pipeline.pipelined_for}

tt.return
}

// CHECK-LABEL: tt.func @three_stage_backend(
// CHECK-NOT: no_inline
// CHECK: gpu.barrier
// CHECK: amdg.cond_barrier
// CHECK: scf.for
// CHECK-NOT: scf.execute_region
// CHECK: rocdl.sched.barrier
// CHECK: rocdl.s.barrier
// CHECK: rocdl.sched.barrier
// CHECK: amdg.cond_barrier
// CHECK: tt.return


// ---- Negative: no total_stages → pass should not touch the loop ----

tt.func @no_total_stages(%n: index) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
scf.for %i = %c0 to %n step %c1 {
scf.execute_region {
%x = arith.addi %i, %c1 : index
scf.yield
}
scf.yield
}
tt.return
}

// CHECK-LABEL: tt.func @no_total_stages(
// CHECK-NOT: gpu.barrier
// CHECK-NOT: amdg.cond_barrier
// CHECK: scf.for
// CHECK: scf.execute_region
// CHECK: tt.return
97 changes: 97 additions & 0 deletions test/TritonGPU/amd/amd-warp-pipeline.mlir
@@ -0,0 +1,97 @@
// RUN: triton-opt %s -tritonamdgpu-warp-pipeline | FileCheck %s

// ---- 3-stage example (two borders) ----

tt.func @three_stage_example(%n: index) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index

scf.for %i = %c0 to %n step %c1 {
// Stage 0 (before first border)
%a = arith.addi %i, %c1 : index
%a2 = arith.muli %a, %c1 : index

// explicit split point → next stage begins
rocdl.sched.barrier 0 {triton.warp_pipeline.border}

// Stage 1
%b = arith.addi %a2, %i : index

// explicit split point → next stage begins
rocdl.sched.barrier 0 {triton.warp_pipeline.border}

// Stage 2
%c = arith.addi %b, %a : index
%d = arith.muli %c, %c1 : index

scf.yield
}

tt.return
}

// CHECK-LABEL: tt.func @three_stage_example(
// CHECK: scf.for
//
// Inside the loop we expect exactly three execute_region clusters:
// CHECK: scf.execute_region
// CHECK: scf.execute_region
// CHECK: scf.execute_region
// CHECK: triton.warp_pipeline.pipelined_for
//
// And the split markers must be gone:
// CHECK-NOT: rocdl.sched.barrier
// CHECK: tt.return


// ---- 2-stage example (one border) ----

tt.func @two_stage_example(%n: index) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index

scf.for %i = %c0 to %n step %c1 {
// Stage 0
%x = arith.addi %i, %c1 : index

// split to Stage 1
rocdl.sched.barrier 0 {triton.warp_pipeline.border}

// Stage 1
%y = arith.muli %x, %c1 : index

scf.yield
}

tt.return
}

// CHECK-LABEL: tt.func @two_stage_example(
// CHECK: scf.for
// CHECK: scf.execute_region
// CHECK: scf.execute_region
// CHECK: triton.warp_pipeline.pipelined_for
// CHECK-NOT: rocdl.sched.barrier
// CHECK: tt.return


// ---- Negative: no border → no structuring ----

tt.func @no_split_example(%n: index) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index

scf.for %i = %c0 to %n step %c1 {
%x = arith.addi %i, %c1 : index
%y = arith.muli %x, %c1 : index
scf.yield
}

tt.return
}

// CHECK-LABEL: tt.func @no_split_example(
// CHECK: scf.for
// CHECK-NOT: scf.execute_region
// CHECK-NOT: pipelined_for
// CHECK: tt.return
2 changes: 2 additions & 0 deletions third_party/amd/backend/compiler.py
@@ -267,6 +267,7 @@ def gluon_to_ttgir(src, metadata, options):
    passes.ttir.add_loop_aware_cse(pm)
    passes.gluon.add_canonicalizer(pm)
    passes.ttgpuir.add_combine_tensor_select_and_if(pm)
    amd.passes.ttgpuir.add_warp_pipeline(pm)

    pm.run(mod, 'gluon_to_ttgir')
    return mod
@@ -285,6 +286,7 @@ def make_llir(src, metadata, options):
    # LDS size is determined by provided arch name.
    custom_lds_size = 0
    amd.passes.ttgpuir.add_optimize_lds_usage(pm, options.arch, custom_lds_size)
    amd.passes.ttgpuir.add_warp_pipeline_conversion(pm)
    passes.convert.add_scf_to_cf(pm)
    passes.gluon.add_inliner(pm)
    passes.convert.add_index_to_llvmir(pm)
1 change: 1 addition & 0 deletions third_party/amd/include/TritonAMDGPUToLLVM/Passes.h
@@ -31,6 +31,7 @@ namespace mlir::triton::AMD {
/// @return created pass
std::unique_ptr<OperationPass<ModuleOp>>
createOptimizeLDSUsagePass(StringRef arch, int32_t customLDSLimit = 0);
std::unique_ptr<OperationPass<ModuleOp>> createConvertWarpPipelinePass();

void runScalarizePackedFOpsPass(llvm::Function &F);

Expand Down
11 changes: 11 additions & 0 deletions third_party/amd/include/TritonAMDGPUToLLVM/Passes.td
@@ -88,4 +88,15 @@ def TritonAMDGPULowerInstructionSchedHints : Pass<"triton-amdgpu-lower-insert-in
];
}

def ConvertWarpPipeline : Pass<"convert-warp-pipeline", "mlir::ModuleOp"> {
let summary = "Emit conditional barrier and inlines scf.execute_region for warp-pipeline";
let constructor = "mlir::triton::AMD::createConvertWarpPipelinePass()";

let dependentDialects = ["mlir::LLVM::LLVMDialect",
"mlir::gpu::GPUDialect",
"mlir::ROCDL::ROCDLDialect",
"mlir::triton::amdgpu::TritonAMDGPUDialect"];

}

#endif
10 changes: 10 additions & 0 deletions third_party/amd/include/TritonAMDGPUTransforms/Passes.td
@@ -291,4 +291,14 @@ def TritonAMDGPUOptimizeDotOperands : Pass<"tritonamdgpu-optimize-dot-operands",
];
}

def TritonAMDGPUWarpPipeline: Pass<"tritonamdgpu-warp-pipeline", "mlir::ModuleOp"> {
let summary = "partition and pipeline";

let description = [{
This pass reorder instructions to interleave instructions from two warps on the same SIMD unit.
}];

let dependentDialects = ["mlir::ROCDL::ROCDLDialect, mlir::triton::amdgpu::TritonAMDGPUDialect"];
}

#endif
1 change: 1 addition & 0 deletions third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt
@@ -5,6 +5,7 @@ add_triton_library(TritonAMDGPUToLLVM
BufferOpsEmitter.cpp
TensorPtrOpsToLLVM.cpp
ConvertLayoutOpToLLVM.cpp
ConvertWarpPipeline.cpp
MemoryOpToLLVM.cpp
MaskedOpsToLLVM.cpp
DotOpToLLVM/FMA.cpp