|
1 | | -// RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=1" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS1 |
2 | | -// RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS2 |
3 | | -// RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritonamdgpu-accelerate-matmul="arch-generation-name=gfx942 matrix-instruction-size=16 kPack=1" -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" -triton-amdgpu-insert-instruction-sched-hints="variant=local_prefetch" -tritongpu-reduce-data-duplication -optimize-amd-lds-usage="target-arch=gfx942" -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm="arch=gfx942" -triton-amdgpu-lower-insert-instruction-sched-hints="arch=gfx942 num_stages=2" -debug-only="lower-insert-instruction-sched-hints" -verify-diagnostics 2>&1 | FileCheck %s -check-prefix=USE_LOCAL_PREFETCH_GLOBAL_LOAD |
4 | 1 | // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=1" | FileCheck %s -check-prefix=LABELING_PS_1 |
5 | 2 | // RUN: triton-opt %s -convert-triton-to-tritongpu="target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64" -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline="num_stages=2" | FileCheck %s -check-prefix=LABELING_PS_2 |
6 | 3 |
|
7 | 4 | module { |
8 | | - // INSTR_COUNT_NS1-LABEL: @test_dot_op |
9 | | - // INSTR_COUNT_NS2-LABEL: @test_dot_op |
10 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: @test_dot_op |
11 | 5 | // LABELING_PS_1-LABEL: @test_dot_op |
12 | 6 | // LABELING_PS_2-LABEL: @test_dot_op |
13 | 7 | tt.func @test_dot_op(%lb : index, %ub : index, %step : index, |
@@ -40,96 +34,6 @@ module { |
40 | 34 | %a = tt.load %a_ptr : tensor<128x32x!tt.ptr<f16>> |
41 | 35 | %b = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr<f16>> |
42 | 36 |
|
43 | | - // INSTR_COUNT_NS1: amdgpu.instruction_sched_hint |
44 | | - // INSTR_COUNT_NS1-SAME: isBufferLoadsAEnabled = false |
45 | | - // INSTR_COUNT_NS1-SAME: isBufferLoadsBEnabled = false |
46 | | - // INSTR_COUNT_NS1-SAME: numDsReadsA = #amdgpu.InstCounter<8, vector<4xf16>> |
47 | | - // INSTR_COUNT_NS1-SAME: numDsReadsB = #amdgpu.InstCounter<32, vector<1xf16>> |
48 | | - // INSTR_COUNT_NS1-SAME: numDsWritesA = #amdgpu.InstCounter<0, none> |
49 | | - // INSTR_COUNT_NS1-SAME: numDsWritesB = #amdgpu.InstCounter<0, none> |
50 | | - // INSTR_COUNT_NS1-SAME: numGlobalLoadsA = #amdgpu.InstCounter<4, vector<4xf16>> |
51 | | - // INSTR_COUNT_NS1-SAME: numGlobalLoadsB = #amdgpu.InstCounter<4, vector<4xf16>> |
52 | | - // INSTR_COUNT_NS1-SAME: numMMAs = #amdgpu.InstCounter<16, tensor<32x32x8xf16>> |
53 | | - |
54 | | - // INSTR_COUNT_NS2: amdgpu.instruction_sched_hint |
55 | | - // INSTR_COUNT_NS2-SAME: isBufferLoadsAEnabled = false |
56 | | - // INSTR_COUNT_NS2-SAME: isBufferLoadsBEnabled = false |
57 | | - // INSTR_COUNT_NS2-SAME: numDsReadsA = #amdgpu.InstCounter<8, vector<4xf16>> |
58 | | - // INSTR_COUNT_NS2-SAME: numDsReadsB = #amdgpu.InstCounter<32, vector<1xf16>> |
59 | | - // INSTR_COUNT_NS2-SAME: numDsWritesA = #amdgpu.InstCounter<4, vector<4xf16>> |
60 | | - // INSTR_COUNT_NS2-SAME: numDsWritesB = #amdgpu.InstCounter<4, vector<4xf16>> |
61 | | - // INSTR_COUNT_NS2-SAME: numGlobalLoadsA = #amdgpu.InstCounter<4, vector<4xf16>> |
62 | | - // INSTR_COUNT_NS2-SAME: numGlobalLoadsB = #amdgpu.InstCounter<4, vector<4xf16>> |
63 | | - // INSTR_COUNT_NS2-SAME: numMMAs = #amdgpu.InstCounter<16, tensor<32x32x8xf16>> |
64 | | - |
65 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.barrier [[SCHED_GUARD:.+]] |
66 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE:512]], 1, 0 |
67 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA:8]], 1, 0 |
68 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ:32]], 1, 0 |
69 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0 |
70 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
71 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0 |
72 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0 |
73 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
74 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0 |
75 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0 |
76 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
77 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0 |
78 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0 |
79 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
80 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0 |
81 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0 |
82 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
83 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0 |
84 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0 |
85 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
86 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0 |
87 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_WRITE]], 1, 0 |
88 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
89 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[VMEM_READ]], 1, 0 |
90 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ:256]], 2, 0 |
91 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
92 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
93 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
94 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
95 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
96 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
97 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
98 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
99 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
100 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
101 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
102 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
103 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
104 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
105 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
106 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
107 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
108 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
109 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
110 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
111 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
112 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
113 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
114 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
115 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
116 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
117 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
118 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
119 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
120 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
121 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
122 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
123 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
124 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
125 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
126 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
127 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
128 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[DS_READ]], 2, 0 |
129 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.group.barrier [[MFMA]], 1, 0 |
130 | | - // USE_LOCAL_PREFETCH_GLOBAL_LOAD: rocdl.sched.barrier [[SCHED_GUARD]] |
131 | | - |
132 | | - |
133 | 37 | // LABELING_PS_1: scf.for |
134 | 38 | // LABELING_PS_1: %[[REG0_OP0:.+]] = tt.load {{.*}} {OpIdx = #amdgpu.OpIdx<0>} |
135 | 39 | // LABELING_PS_1: %[[REG0_OP1:.+]] = tt.load {{.*}} {OpIdx = #amdgpu.OpIdx<1>} |
|
0 commit comments