microsoft · manbearian · Dec 1, 2023 · Dec 1, 2023 · Dec 1, 2023 · Dec 1, 2023
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
@@ -10,6 +10,6 @@ jobs:
   call-workflow:
     uses: ./.github/workflows/test-plugin.yml
     with:
-        # [BACKEND] Fix layouts in nvgpu RewriteTensorPointer. (#2634)
-        triton-ref: '0e3bf3f58061bd62725e7c502e7e0af19690df1c'
+        # [BACKEND] Refactor wgmma descriptor creation
+        triton-ref: '56c284cf7e39f249cdf1d8d5dba7892deb0286d6'
         triton-shared-ref: ${{ github.ref }}
diff --git a/README.md b/README.md
@@ -102,7 +102,7 @@ Important details to note:
 %reinterpret_cast = memref.reinterpret_cast %arg2 to offset: [...] memref<*xf32> to memref<1024xf32>
 %extracted_slice = tensor.extract_slice %15[0] [%21] [1] : tensor<1024xf32> to tensor<?xf32>
 %subview = memref.subview %reinterpret_cast[0] [%21] [1] : memref<1024xf32> to memref<?xf32>
-memref.tensor_store %extracted_slice, %subview : memref<?xf32>
+bufferization.materialize_in_destination %extracted_slice in writable %subview
 ```
 
 + element-wise `arith` and `math` operators are converted to their corresponding `linalg.generic` version.

diff --git a/lib/Conversion/TritonToLinalg/TritonToLinalg.cpp b/lib/Conversion/TritonToLinalg/TritonToLinalg.cpp
@@ -744,7 +744,9 @@ struct StoreConverter : public OpConversionPattern<triton::StoreOp> {
 
     // 1. Simple case where no mask is used.
     if (!mask) {
-      rewriter.create<memref::TensorStoreOp>(loc, val, ptr);
+      auto storeOp = rewriter.create<bufferization::MaterializeInDestinationOp>(
+          loc, val, ptr);
+      storeOp.setWritable(true);
       rewriter.eraseOp(op);
       return success();
     }
@@ -761,7 +763,9 @@ struct StoreConverter : public OpConversionPattern<triton::StoreOp> {
     auto srcSlice = mstate.getExtractSlice(val, loc, rewriter);
     auto dstSubview = mstate.getSubview(ptr, loc, rewriter);
 
-    rewriter.create<memref::TensorStoreOp>(loc, srcSlice, dstSubview);
+    auto storeOp = rewriter.create<bufferization::MaterializeInDestinationOp>(
+        loc, srcSlice, dstSubview);
+    storeOp.setWritable(true);
     rewriter.eraseOp(op);
 
     return success();

diff --git a/test/Conversion/TritonToLinalg/addptr_2d_example.mlir b/test/Conversion/TritonToLinalg/addptr_2d_example.mlir
@@ -64,6 +64,6 @@ module {
 // CHECK:           } -> tensor<4x256xbf16>
 // CHECK:           %[[VAL_21:.*]] = arith.index_cast %[[VAL_3]] : i32 to index
 // CHECK:           %[[VAL_22:.*]] = memref.reinterpret_cast %[[VAL_2]] to offset: {{\[}}%[[VAL_21]]], sizes: [4, 256], strides: [1, %[[VAL_7]]] : memref<*xbf16> to memref<4x256xbf16, strided<[1, ?], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_23:.*]], %[[VAL_22]] : memref<4x256xbf16, strided<[1, ?], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_23:.*]] in writable %[[VAL_22]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_add_value.mlir b/test/Conversion/TritonToLinalg/addptr_add_value.mlir
@@ -63,6 +63,6 @@ module {
 // CHECK:           %[[VAL_19:.*]] = memref.alloc() : memref<4x256xbf16>
 // CHECK:           memref.copy %[[VAL_13]], %[[VAL_19]] : memref<4x256xbf16, strided<[1, ?], offset: ?>> to memref<4x256xbf16>
 // CHECK:           %[[VAL_20:.*]] = bufferization.to_tensor %[[VAL_19]] restrict writable : memref<4x256xbf16>
-// CHECK:           memref.tensor_store %[[VAL_20]], %[[VAL_18]] : memref<4x256xbf16, strided<[1, ?], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_20]] in writable %[[VAL_18]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_dim1.mlir b/test/Conversion/TritonToLinalg/addptr_dim1.mlir
@@ -81,7 +81,7 @@ module {
 // CHECK-DAG:       [[VAR_2_:%.+]] = bufferization.to_tensor [[RES_]] restrict writable : memref<1x256xbf16>
 // CHECK-DAG:       [[VAR_3_:%.+]] = arith.index_cast [[PARAM_1_]] : i32 to index
 // CHECK:           [[VAR_reinterpret_cast_0_:%.+]] = memref.reinterpret_cast [[PARAM_0_]] to offset: {{.}}[[VAR_3_]]{{.}}, sizes: [1, 256], strides: [256, 1] : memref<*xbf16> to memref<1x256xbf16, strided<[256, 1], offset: ?>>
-// CHECK:           memref.tensor_store [[VAR_2_]], [[VAR_reinterpret_cast_0_]] : memref<1x256xbf16, strided<[256, 1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination [[VAR_2_]] in writable [[VAR_reinterpret_cast_0_]]
 // CHECK-DAG:       [[VAR_4_:%.+]]:3 = scf.for [[VAR_arg5_:%.+]] = [[CST_0_]] to [[CST_12_]] step [[CST_3_]] iter_args([[VAR_arg6_:%.+]] = [[VAR_1_]], [[VAR_arg7_:%.+]] = [[CST_0_]], [[VAR_arg8_:%.+]] = [[CST_0_]]) -> (tensor<4x256xbf16>, index, index) {
 // CHECK-DAG:         [[VAR_5_:%.+]] = arith.index_cast [[VAR_arg5_]] : index to i32
 // CHECK:             [[VAR_6_:%.+]] = arith.muli [[VAR_5_]], [[CST_256_1_]] : i32
@@ -102,6 +102,6 @@ module {
 // CHECK:             scf.yield [[VAR_10_]], [[VAR_12_]], [[CST_0_]] : tensor<4x256xbf16>, index, index
 // CHECK:           }
 // CHECK:           [[VAR_reinterpret_cast_1_:%.+]] = memref.reinterpret_cast [[PARAM_0_]] to offset: [0], sizes: [4, 256], strides: {{.}}[[CST_256_]], 1] : memref<*xbf16> to memref<4x256xbf16, strided<[?, 1]>>
-// CHECK:           memref.tensor_store [[VAR_4_]]#0, [[VAR_reinterpret_cast_1_]] : memref<4x256xbf16, strided<[?, 1]>>
+// CHECK:           bufferization.materialize_in_destination [[VAR_4_]]#0 in writable [[VAR_reinterpret_cast_1_]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_for_accumulation.mlir b/test/Conversion/TritonToLinalg/addptr_for_accumulation.mlir
@@ -88,6 +88,6 @@ module {
 // CHECK:           }
 // CHECK:           %[[VAL_36:.*]] = arith.index_cast %[[VAL_3]] : i32 to index
 // CHECK:           %[[VAL_37:.*]] = memref.reinterpret_cast %[[VAL_2]] to offset: {{\[}}%[[VAL_36]]], sizes: [4, 256], strides: [1, %[[VAL_8]]] : memref<*xbf16> to memref<4x256xbf16, strided<[1, ?], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_38:.*]]#0, %[[VAL_37]] : memref<4x256xbf16, strided<[1, ?], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_38:.*]]#0 in writable %[[VAL_37]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_for_expand_ptr.mlir b/test/Conversion/TritonToLinalg/addptr_for_expand_ptr.mlir
@@ -65,7 +65,7 @@ module {
 // CHECK:             %[[VAL_15:.*]] = memref.alloc() : memref<256x256xbf16>
 // CHECK:             memref.copy %[[VAL_14]], %[[VAL_15]] : memref<256x256xbf16, strided<[?, 1], offset: ?>> to memref<256x256xbf16>
 // CHECK:             %[[VAL_16:.*]] = bufferization.to_tensor %[[VAL_15]] restrict writable : memref<256x256xbf16>
-// CHECK:             memref.tensor_store %[[VAL_16]], %[[VAL_14]] : memref<256x256xbf16, strided<[?, 1], offset: ?>>
+// CHECK:             bufferization.materialize_in_destination %[[VAL_16]] in writable %[[VAL_14]]
 // CHECK:             %[[VAL_17:.*]] = arith.addi %[[VAL_12]], %[[VAL_9]] : index
 // CHECK:             scf.yield %[[VAL_17]] : index
 // CHECK:           }

diff --git a/test/Conversion/TritonToLinalg/addptr_for_more_init_args.mlir b/test/Conversion/TritonToLinalg/addptr_for_more_init_args.mlir
@@ -55,7 +55,7 @@ module {
 // CHECK:             %[[VAL_23:.*]] = memref.alloc() : memref<256xbf16>
 // CHECK:             memref.copy %[[VAL_17]], %[[VAL_23]] : memref<256xbf16, strided<[?], offset: ?>> to memref<256xbf16>
 // CHECK:             %[[VAL_24:.*]] = bufferization.to_tensor %[[VAL_23]] restrict writable : memref<256xbf16>
-// CHECK:             memref.tensor_store %[[VAL_24]], %[[VAL_19]] : memref<256xbf16, strided<[?], offset: ?>>
+// CHECK:             bufferization.materialize_in_destination %[[VAL_24]] in writable %[[VAL_19]]
 // CHECK:             %[[VAL_25:.*]] = arith.addi %[[VAL_21]], %[[VAL_10]] : index
 // CHECK:             %[[VAL_26:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_25]]], sizes: [256], strides: {{\[}}%[[VAL_8]]] : memref<*xbf16> to memref<256xbf16, strided<[?], offset: ?>>
 // CHECK:             %[[VAL_27:.*]] = arith.addi %[[VAL_16]], %[[VAL_10]] : index

diff --git a/test/Conversion/TritonToLinalg/addptr_for_used_after_update.mlir b/test/Conversion/TritonToLinalg/addptr_for_used_after_update.mlir
@@ -91,7 +91,7 @@ module {
 // CHECK:             %[[VAL_14:.*]] = memref.alloc() : memref<256xbf16>
 // CHECK:             memref.copy %[[VAL_13]], %[[VAL_14]] : memref<256xbf16, strided<[?], offset: ?>> to memref<256xbf16>
 // CHECK:             %[[VAL_15:.*]] = bufferization.to_tensor %[[VAL_14]] restrict writable : memref<256xbf16>
-// CHECK:             memref.tensor_store %[[VAL_15]], %[[VAL_13]] : memref<256xbf16, strided<[?], offset: ?>>
+// CHECK:             bufferization.materialize_in_destination %[[VAL_15]] in writable %[[VAL_13]]
 // CHECK:             scf.yield %[[VAL_12]] : index
 // CHECK:           }
 // CHECK:           return

diff --git a/test/Conversion/TritonToLinalg/addptr_for_used_before_update.mlir b/test/Conversion/TritonToLinalg/addptr_for_used_before_update.mlir
@@ -46,7 +46,7 @@ module {
 // CHECK:             %[[VAL_14:.*]] = memref.alloc() : memref<256xbf16>
 // CHECK:             memref.copy %[[VAL_12]], %[[VAL_14]] : memref<256xbf16, strided<[?], offset: ?>> to memref<256xbf16>
 // CHECK:             %[[VAL_15:.*]] = bufferization.to_tensor %[[VAL_14]] restrict writable : memref<256xbf16>
-// CHECK:             memref.tensor_store %[[VAL_15]], %[[VAL_12]] : memref<256xbf16, strided<[?], offset: ?>>
+// CHECK:             bufferization.materialize_in_destination %[[VAL_15]] in writable %[[VAL_12]]
 // CHECK:             %[[VAL_16:.*]] = arith.addi %[[VAL_13]], %[[VAL_8]] : index
 // CHECK:             %[[VAL_17:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_16]]], sizes: [256], strides: {{\[}}%[[VAL_4]]] : memref<*xbf16> to memref<256xbf16, strided<[?], offset: ?>>
 // CHECK:             scf.yield %[[VAL_17]], %[[VAL_16]] : memref<256xbf16, strided<[?], offset: ?>>, index

diff --git a/test/Conversion/TritonToLinalg/addptr_loopback.mlir b/test/Conversion/TritonToLinalg/addptr_loopback.mlir
@@ -48,6 +48,6 @@ module {
 // CHECK:           %[[VAL_11:.*]] = memref.alloc() : memref<4x256xbf16>
 // CHECK:           memref.copy %[[VAL_8]], %[[VAL_11]] : memref<4x256xbf16, strided<[1, ?], offset: ?>> to memref<4x256xbf16>
 // CHECK:           %[[VAL_12:.*]] = bufferization.to_tensor %[[VAL_11]] restrict writable : memref<4x256xbf16>
-// CHECK:           memref.tensor_store %[[VAL_12]], %[[VAL_10]] : memref<4x256xbf16, strided<[1, ?], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_12]] in writable %[[VAL_10]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_mul_const_const.mlir b/test/Conversion/TritonToLinalg/addptr_mul_const_const.mlir
@@ -44,6 +44,6 @@ module {
 // CHECK:           %[[VAL_13:.*]] = memref.alloc() : memref<1024xbf16>
 // CHECK:           memref.copy %[[VAL_10]], %[[VAL_13]] : memref<1024xbf16, strided<[?], offset: ?>> to memref<1024xbf16>
 // CHECK:           %[[VAL_14:.*]] = bufferization.to_tensor %[[VAL_13]] restrict writable : memref<1024xbf16>
-// CHECK:           memref.tensor_store %[[VAL_14]], %[[VAL_12]] : memref<1024xbf16, strided<[1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_14]] in writable %[[VAL_12]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_mul_value_const.mlir b/test/Conversion/TritonToLinalg/addptr_mul_value_const.mlir
@@ -46,6 +46,6 @@ module {
 // CHECK:           %[[VAL_18:.*]] = memref.alloc() : memref<1024xbf16>
 // CHECK:           memref.copy %[[VAL_15]], %[[VAL_18]] : memref<1024xbf16, strided<[?], offset: ?>> to memref<1024xbf16>
 // CHECK:           %[[VAL_19:.*]] = bufferization.to_tensor %[[VAL_18]] restrict writable : memref<1024xbf16>
-// CHECK:           memref.tensor_store %[[VAL_19]], %[[VAL_17]] : memref<1024xbf16, strided<[1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_19]] in writable %[[VAL_17]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_nested.mlir b/test/Conversion/TritonToLinalg/addptr_nested.mlir
@@ -68,6 +68,6 @@ module {
 // CHECK:           %[[VAL_26:.*]] = arith.index_cast %[[VAL_1]] : i32 to index
 // CHECK:           %[[VAL_27:.*]] = arith.addi %[[VAL_25]], %[[VAL_26]] : index
 // CHECK:           %[[VAL_28:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_27]]], sizes: [4, 256], strides: [3, %[[VAL_5]]] : memref<*xbf16> to memref<4x256xbf16, strided<[3, ?], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_29:.*]], %[[VAL_28]] : memref<4x256xbf16, strided<[3, ?], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_29:.*]] in writable %[[VAL_28]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_reshape_broadcast.mlir b/test/Conversion/TritonToLinalg/addptr_reshape_broadcast.mlir
@@ -38,6 +38,6 @@ module {
 // CHECK:           %[[VAL_8:.*]] = memref.alloc() : memref<256x128xbf16>
 // CHECK:           memref.copy %[[VAL_7]], %[[VAL_8]] : memref<256x128xbf16, strided<[1, ?], offset: 6656>> to memref<256x128xbf16>
 // CHECK:           %[[VAL_9:.*]] = bufferization.to_tensor %[[VAL_8]] restrict writable : memref<256x128xbf16>
-// CHECK:           memref.tensor_store %[[VAL_9]], %[[VAL_7]] : memref<256x128xbf16, strided<[1, ?], offset: 6656>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_9]] in writable %[[VAL_7]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_scalar_broadcast.mlir b/test/Conversion/TritonToLinalg/addptr_scalar_broadcast.mlir
@@ -60,6 +60,6 @@ module {
 // CHECK:           %[[VAL_17:.*]] = arith.muli %[[ARG_8]], %[[VAL_3]] : i32
 // CHECK:           %[[VAL_18:.*]] = arith.index_cast %[[VAL_17]] : i32 to index
 // CHECK:           %[[VAL_19:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_18]]], sizes: [1024, 1024], strides: [1, 1] : memref<*xf32> to memref<1024x1024xf32, strided<[1, 1], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_20:.*]], %[[VAL_19]] : memref<1024x1024xf32, strided<[1, 1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_20:.*]] in writable %[[VAL_19]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_scalar_for.mlir b/test/Conversion/TritonToLinalg/addptr_scalar_for.mlir
@@ -65,6 +65,6 @@ module {
 // CHECK:           %[[VAL_35:.*]] = arith.muli %[[ARG_8]], %[[VAL_3]] : i32
 // CHECK:           %[[VAL_36:.*]] = arith.index_cast %[[VAL_35]] : i32 to index
 // CHECK:           %[[VAL_37:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_36]]], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_38:.*]]#0, %[[VAL_37]] : memref<1024xf32, strided<[1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_38:.*]]#0 in writable %[[VAL_37]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_scalar_for_2d.mlir b/test/Conversion/TritonToLinalg/addptr_scalar_for_2d.mlir
@@ -87,6 +87,6 @@ module {
 // CHECK:           %[[VAL_38:.*]] = arith.index_cast %[[VAL_37]] : i32 to index
 // CHECK:           %[[VAL_39:.*]] = arith.addi %[[VAL_38]], %[[VAL_8]] : index
 // CHECK:           %[[VAL_40:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_39]]], sizes: [128, 128], strides: [1, 1] : memref<*xf32> to memref<128x128xf32, strided<[1, 1], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_41:.*]]#0, %[[VAL_40]] : memref<128x128xf32, strided<[1, 1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_41:.*]]#0 in writable %[[VAL_40]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_scalar_nested.mlir b/test/Conversion/TritonToLinalg/addptr_scalar_nested.mlir
@@ -52,6 +52,6 @@ module {
 // CHECK:           %[[VAL_23:.*]] = arith.muli %[[ARG_8]], %[[VAL_3]] : i32
 // CHECK:           %[[VAL_24:.*]] = arith.index_cast %[[VAL_23]] : i32 to index
 // CHECK:           %[[VAL_25:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_24]]], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_26:.*]], %[[VAL_25]] : memref<1024xf32, strided<[1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_26:.*]] in writable %[[VAL_25]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_scalar_splat.mlir b/test/Conversion/TritonToLinalg/addptr_scalar_splat.mlir
@@ -40,6 +40,6 @@ module {
 // CHECK:           %[[VAL_17:.*]] = arith.muli %[[ARG_8]], %[[VAL_3]] : i32
 // CHECK:           %[[VAL_18:.*]] = arith.index_cast %[[VAL_17]] : i32 to index
 // CHECK:           %[[VAL_19:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_18]]], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_20:.*]], %[[VAL_19]] : memref<1024xf32, strided<[1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_20:.*]] in writable %[[VAL_19]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/addptr_scalar_splat_2d.mlir b/test/Conversion/TritonToLinalg/addptr_scalar_splat_2d.mlir
@@ -51,6 +51,6 @@ module {
 // CHECK:           %[[VAL_20:.*]] = arith.index_cast %[[VAL_19]] : i32 to index
 // CHECK:           %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_8]] : index
 // CHECK:           %[[VAL_22:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: {{\[}}%[[VAL_21]]], sizes: [128, 128], strides: [1, 1] : memref<*xf32> to memref<128x128xf32, strided<[1, 1], offset: ?>>
-// CHECK:           memref.tensor_store %[[VAL_23:.*]], %[[VAL_22]] : memref<128x128xf32, strided<[1, 1], offset: ?>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_23:.*]] in writable %[[VAL_22]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/arith_not_ptr_arith.mlir b/test/Conversion/TritonToLinalg/arith_not_ptr_arith.mlir
@@ -34,6 +34,6 @@ module {
 // CHECK:             %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_13]] : i32
 // CHECK:             linalg.yield %[[VAL_15]] : i32
 // CHECK:           } -> tensor<1024xi32>
-// CHECK:           memref.tensor_store %[[VAL_16:.*]], %[[VAL_6]] : memref<1024xi32, strided<[1]>>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_16:.*]] in writable %[[VAL_6]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/bitcast.mlir b/test/Conversion/TritonToLinalg/bitcast.mlir
@@ -37,7 +37,7 @@ module {
 // CHECK:     [[VAR_5_:%.+]] = arith.bitcast %in : i32 to f32
 // CHECK:     linalg.yield [[VAR_5_]] : f32
 // CHECK:   } -> tensor<1024xf32>
-// CHECK:   memref.tensor_store [[VAR_2_]], [[RC_0_]] : memref<1024xf32, strided<[1]>>
+// CHECK:   bufferization.materialize_in_destination [[VAR_2_]] in writable [[RC_0_]]
 // CHECK:     return
 // CHECK:   }
 // CHECK: }

diff --git a/test/Conversion/TritonToLinalg/block_ptr_advance.mlir b/test/Conversion/TritonToLinalg/block_ptr_advance.mlir
@@ -87,7 +87,7 @@ module {
 // CHECK:     %15 = arith.muli %11, %13 : index
 // CHECK:     %16 = arith.addi %14, %15 : index
 // CHECK:     %reinterpret_cast_1 = memref.reinterpret_cast %arg2 to offset: [%16], sizes: [128, 64], strides: [%12, %13] : memref<*xbf16> to memref<128x64xbf16, strided<[?, ?], offset: ?>>
-// CHECK:     memref.tensor_store %8#0, %reinterpret_cast_1 : memref<128x64xbf16, strided<[?, ?], offset: ?>>
+// CHECK:     bufferization.materialize_in_destination %8#0 in writable %reinterpret_cast_1
 // CHECK:     return
 // CHECK:   }
 // CHECK: }
diff --git a/test/Conversion/TritonToLinalg/convert_1d_elemwise_arith_binary.mlir b/test/Conversion/TritonToLinalg/convert_1d_elemwise_arith_binary.mlir
@@ -67,6 +67,6 @@ module {
 // CHECK:             %[[VAL_48:.*]] = arith.select %[[VAL_44]], %[[VAL_45]], %[[VAL_46]] : f32
 // CHECK:             linalg.yield %[[VAL_48]] : f32
 // CHECK:           } -> tensor<1024xf32>
-// CHECK:           memref.tensor_store %[[VAL_49:.*]], %[[VAL_2]] : memref<1024xf32>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_49:.*]] in writable %[[VAL_2]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/convert_1d_elemwise_arith_ternary.mlir b/test/Conversion/TritonToLinalg/convert_1d_elemwise_arith_ternary.mlir
@@ -44,6 +44,6 @@ module {
 // CHECK:             %[[VAL_21:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_19]] : f32
 // CHECK:             linalg.yield %[[VAL_21]] : f32
 // CHECK:           } -> tensor<1024xf32>
-// CHECK:           memref.tensor_store %[[VAL_22:.*]], %[[VAL_3]] : memref<1024xf32>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_22:.*]] in writable %[[VAL_3]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/convert_1d_elemwise_arith_unary.mlir b/test/Conversion/TritonToLinalg/convert_1d_elemwise_arith_unary.mlir
@@ -79,10 +79,10 @@ module {
 // CHECK:             %[[VAL_42:.*]] = math.sqrt %[[VAL_40]] : f32
 // CHECK:             linalg.yield %[[VAL_42]] : f32
 // CHECK:           } -> tensor<1024xf32>
-// CHECK:           memref.tensor_store %[[VAL_43:.*]], %[[VAL_3]] : memref<1024xbf16>
-// CHECK:           memref.tensor_store %[[VAL_44:.*]], %[[VAL_4]] : memref<1024xf32>
-// CHECK:           memref.tensor_store %[[VAL_45:.*]], %[[VAL_5]] : memref<1024xf32>
-// CHECK:           memref.tensor_store %[[VAL_46:.*]], %[[VAL_6]] : memref<1024xf32>
-// CHECK:           memref.tensor_store %[[VAL_47:.*]], %[[VAL_7]] : memref<1024xf32>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_43:.*]] in writable %[[VAL_3]]
+// CHECK:           bufferization.materialize_in_destination %[[VAL_44:.*]] in writable %[[VAL_4]]
+// CHECK:           bufferization.materialize_in_destination %[[VAL_45:.*]] in writable %[[VAL_5]]
+// CHECK:           bufferization.materialize_in_destination %[[VAL_46:.*]] in writable %[[VAL_6]]
+// CHECK:           bufferization.materialize_in_destination %[[VAL_47:.*]] in writable %[[VAL_7]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/convert_2d_elemwise_arith_binary.mlir b/test/Conversion/TritonToLinalg/convert_2d_elemwise_arith_binary.mlir
@@ -49,7 +49,7 @@ module {
 // CHECK:             %[[VAL_22:.*]] = arith.subf %[[VAL_19]], %[[VAL_20]] : f32
 // CHECK:             linalg.yield %[[VAL_22]] : f32
 // CHECK:           } -> tensor<128x128xf32>
-// CHECK:           memref.tensor_store %[[VAL_23:.*]], %[[VAL_2]] : memref<128x128xf32>
-// CHECK:           memref.tensor_store %[[VAL_24:.*]], %[[VAL_3]] : memref<128x128xf32>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_23:.*]] in writable %[[VAL_2]]
+// CHECK:           bufferization.materialize_in_destination %[[VAL_24:.*]] in writable %[[VAL_3]]
 // CHECK:           return
 // CHECK:         }
diff --git a/test/Conversion/TritonToLinalg/convert_2d_elemwise_arith_ternary.mlir b/test/Conversion/TritonToLinalg/convert_2d_elemwise_arith_ternary.mlir
@@ -50,6 +50,6 @@ module {
 // CHECK:             %[[VAL_21:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_19]] : f32
 // CHECK:             linalg.yield %[[VAL_21]] : f32
 // CHECK:           } -> tensor<128x128xf32>
-// CHECK:           memref.tensor_store %[[VAL_22:.*]], %[[VAL_3]] : memref<128x128xf32>
+// CHECK:           bufferization.materialize_in_destination %[[VAL_22:.*]] in writable %[[VAL_3]]
 // CHECK:           return
 // CHECK:         }