// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx950 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target{for-rocdl=true})))))" %s | FileCheck %s

// Test: Scaled matmul (f4E2M1FN * f4E2M1FN with f8E8M0FNU scales) compiles
// through the full pipeline with DMA config. This validates that the pipeline
// handles sub-byte types correctly, including the narrow type emulation for
// gather_to_lds ops.
// Five bindings: LHS, RHS, LHS scales, and RHS scales as read-only storage
// buffers, plus one writable storage buffer for the result.
#pipeline_layout = #hal.pipeline.layout<bindings = [
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer, ReadOnly>,
  #hal.pipeline.binding<storage_buffer>
]>
// TileAndFuse pipeline with a 512-thread workgroup and wave64 subgroups.
// Software pipelining is enabled with 2 prefetch stages; bank-conflict
// padding of shared memory is disabled.
#translation_info = #iree_codegen.translation_info<pipeline =
  LLVMGPUTileAndFuse
  workgroup_size = [512, 1, 1]
  subgroup_size = 64,
  {
    gpu_pipeline_options = #iree_gpu.pipeline_options<
      prefetch_num_stages = 2,
      no_reduce_shared_memory_bank_conflicts = true>
  }
>
// Lowering config: all four inputs are promoted to shared memory. The f4
// LHS/RHS operands (0, 1) use global-load DMA (gather_to_lds), while the f8
// scale operands (2, 3) use thread-based copies via derived_thread_config.
#config = #iree_gpu.lowering_config<{
  mma_kind = #iree_gpu.scaled_mma_layout<
    intrinsic = MFMA_SCALE_F32_16x16x128_B32,
    lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN,
    acc_elem_type = f32>,
  promote_operands = [0, 1, 2, 3],
  promotion_types = [
    #iree_gpu.use_global_load_dma,
    #iree_gpu.use_global_load_dma,
    #iree_gpu.derived_thread_config,
    #iree_gpu.derived_thread_config],
  reduction = [0, 0, 1, 1],
  subgroup = [4, 8, 0, 0],
  workgroup = [256, 256, 0, 0]
}>
// Iteration space (M, N, Ko, Kb): Ko and Kb are both reductions. The scale
// maps drop Kb, so one scale element is shared across the Kb block of the
// corresponding data operand.
#lhs_map = affine_map<(M, N, Ko, Kb) -> (M, Ko, Kb)>
#rhs_map = affine_map<(M, N, Ko, Kb) -> (N, Ko, Kb)>
#scale_m = affine_map<(M, N, Ko, Kb) -> (M, Ko)>
#scale_n = affine_map<(M, N, Ko, Kb) -> (N, Ko)>
#out_map = affine_map<(M, N, Ko, Kb) -> (M, N)>
hal.executable public @main {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
    hal.executable.export public @scaled_matmul_dma ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @scaled_matmul_dma()
        attributes {translation_info = #translation_info} {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512x32xf4E2M1FN>>
        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512x32xf4E2M1FN>>
        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf8E8M0FNU>>
        %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf8E8M0FNU>>
        %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
        %A = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 32], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512x32xf4E2M1FN>> -> tensor<1024x512x32xf4E2M1FN>
        %B = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1024, 512, 32], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512x32xf4E2M1FN>> -> tensor<1024x512x32xf4E2M1FN>
        %A_scales = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf8E8M0FNU>> -> tensor<1024x512xf8E8M0FNU>
        %B_scales = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf8E8M0FNU>> -> tensor<1024x512xf8E8M0FNU>
        %empty = tensor.empty() : tensor<1024x1024xf32>
        %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
        // Scaled matmul: each f4 element is extended to f32 using its
        // f8E8M0FNU scale before the multiply-accumulate.
        %result = linalg.generic {
          indexing_maps = [#lhs_map, #rhs_map, #scale_m, #scale_n, #out_map],
          iterator_types = ["parallel", "parallel", "reduction", "reduction"]
        } ins(%A, %B, %A_scales, %B_scales : tensor<1024x512x32xf4E2M1FN>, tensor<1024x512x32xf4E2M1FN>, tensor<1024x512xf8E8M0FNU>, tensor<1024x512xf8E8M0FNU>) outs(%fill : tensor<1024x1024xf32>) attrs = {lowering_config = #config} {
        ^bb0(%a: f4E2M1FN, %b: f4E2M1FN, %a_scale: f8E8M0FNU, %b_scale: f8E8M0FNU, %out: f32):
          %s1 = arith.scaling_extf %a, %a_scale : f4E2M1FN, f8E8M0FNU to f32
          %s2 = arith.scaling_extf %b, %b_scale : f4E2M1FN, f8E8M0FNU to f32
          %m = arith.mulf %s1, %s2 : f32
          %r = arith.addf %out, %m : f32
          linalg.yield %r : f32
        } -> tensor<1024x1024xf32>
        iree_tensor_ext.dispatch.tensor.store %result, %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
        return
      }
    }
  }
}

// Verify pipeline completes and produces scaled MFMA compute ops.
// LHS/RHS are promoted to workgroup shared memory and scales use thread-based
// copies. The compute uses 16x16x128 scaled MFMA instructions.

// CHECK-LABEL: func.func @scaled_matmul_dma
// CHECK-DAG: memref.alloc() : memref<{{.*}}xf8E8M0FNU, #gpu.address_space<workgroup>>
// CHECK-DAG: memref.alloc() : memref<{{.*}}xf4E2M1FN, #gpu.address_space<workgroup>>
// CHECK: scf.forall
// CHECK: scf.for
// CHECK: amdgpu.scaled_mfma 16x16x128