Skip to content

Assertion `false && "computeCapability not supported"' failed. #6859

@ssuukk

Description

@ssuukk

Describe the bug

An error message in the log told me to share the reproducer below with the Triton project:

`Non-blocking memory transfer: True

TeaCache: Using cache device: cpu
Sampling 37 frames at 960x1696 with 60 steps
HiDream: ComfyUI is unloading all models, cleaning HiDream cache...
HiDream: Cleaning up all cached models...
HiDream: Cache cleared
0%| | 0/60 [00:00<?, ?it/s]
module {
tt.func public @_attn_fwd(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 1
%cst = arith.constant dense<1.000000e+00> : tensor<128xf32>
%cst_0 = arith.constant dense<0xFF800000> : tensor<128xf32>
%c0_i32 = arith.constant 0 : i32
%cst_1 = arith.constant dense<0> : tensor<128x64xi32>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<128x128xf16>
%c1_i32 = arith.constant 1 : i32
%cst_3 = arith.constant dense<0.000000e+00> : tensor<128x128xf32>
%c63_i32 = arith.constant 63 : i32
%c64_i32 = arith.constant 64 : i32
%c127_i32 = arith.constant 127 : i32
%c128_i32 = arith.constant 128 : i32
%c12_i64 = arith.constant 12 : i64
%0 = tt.get_program_id x : i32
%1 = tt.get_program_id z : i32
%2 = arith.extsi %1 : i32 to i64
%3 = tt.get_program_id y : i32
%4 = arith.extsi %3 : i32 to i64
%5 = arith.muli %2, %c12_i64 : i64
%6 = arith.addi %5, %4 : i64
%7 = arith.addi %arg18, %c127_i32 : i32
%8 = arith.divsi %7, %c128_i32 : i32
%9 = arith.extsi %8 : i32 to i64
%10 = arith.muli %6, %9 : i64
%11 = arith.addi %arg19, %c63_i32 : i32
%12 = arith.divsi %11, %c64_i32 : i32
%13 = arith.extsi %12 : i32 to i64
%14 = arith.muli %6, %13 : i64
%15 = arith.muli %0, %c128_i32 : i32
%16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
%17 = tt.splat %15 : i32 -> tensor<128xi32>
%18 = arith.addi %17, %16 : tensor<128xi32>
%19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
%20 = arith.extsi %arg6 : i32 to i64
%21 = arith.muli %2, %20 : i64
%22 = arith.extsi %arg7 : i32 to i64
%23 = arith.muli %4, %22 : i64
%24 = arith.addi %21, %23 : i64
%25 = tt.addptr %arg0, %24 : !tt.ptr, i64
%26 = tt.expand_dims %18 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
%27 = tt.splat %arg8 : i32 -> tensor<128x1xi32>
%28 = arith.muli %26, %27 : tensor<128x1xi32>
%29 = tt.splat %25 : !tt.ptr -> tensor<128x1x!tt.ptr>
%30 = tt.addptr %29, %28 : tensor<128x1x!tt.ptr>, tensor<128x1xi32>
%31 = tt.expand_dims %16 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32>
%32 = tt.broadcast %30 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr>
%33 = tt.broadcast %31 : tensor<1x128xi32> -> tensor<128x128xi32>
%34 = tt.addptr %32, %33 : tensor<128x128x!tt.ptr>, tensor<128x128xi32>
%35 = tt.addptr %arg3, %10 : !tt.ptr, i64
%36 = tt.addptr %35, %0 : !tt.ptr, i32
%37 = arith.extsi %arg9 : i32 to i64
%38 = arith.muli %2, %37 : i64
%39 = arith.extsi %arg10 : i32 to i64
%40 = arith.muli %4, %39 : i64
%41 = arith.addi %38, %40 : i64
%42 = tt.addptr %arg1, %41 : !tt.ptr, i64
%43 = tt.expand_dims %19 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
%44 = tt.splat %arg11 : i32 -> tensor<1x64xi32>
%45 = arith.muli %43, %44 : tensor<1x64xi32>
%46 = tt.splat %42 : !tt.ptr -> tensor<1x64x!tt.ptr>
%47 = tt.addptr %46, %45 : tensor<1x64x!tt.ptr>, tensor<1x64xi32>
%48 = tt.expand_dims %16 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
%49 = tt.broadcast %47 : tensor<1x64x!tt.ptr> -> tensor<128x64x!tt.ptr>
%50 = tt.broadcast %48 : tensor<128x1xi32> -> tensor<128x64xi32>
%51 = tt.addptr %49, %50 : tensor<128x64x!tt.ptr>, tensor<128x64xi32>
%52 = tt.addptr %arg4, %14 : !tt.ptr, i64
%53 = arith.extsi %arg12 : i32 to i64
%54 = arith.muli %2, %53 : i64
%55 = arith.extsi %arg13 : i32 to i64
%56 = arith.muli %4, %55 : i64
%57 = arith.addi %54, %56 : i64
%58 = tt.addptr %arg2, %57 : !tt.ptr, i64
%59 = tt.expand_dims %19 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
%60 = tt.splat %arg14 : i32 -> tensor<64x1xi32>
%61 = arith.muli %59, %60 : tensor<64x1xi32>
%62 = tt.splat %58 : !tt.ptr -> tensor<64x1x!tt.ptr>
%63 = tt.addptr %62, %61 : tensor<64x1x!tt.ptr>, tensor<64x1xi32>
%64 = tt.broadcast %63 : tensor<64x1x!tt.ptr> -> tensor<64x128x!tt.ptr>
%65 = tt.broadcast %31 : tensor<1x128xi32> -> tensor<64x128xi32>
%66 = tt.addptr %64, %65 : tensor<64x128x!tt.ptr>, tensor<64x128xi32>
%67 = arith.extsi %arg15 : i32 to i64
%68 = arith.muli %2, %67 : i64
%69 = arith.extsi %arg16 : i32 to i64
%70 = arith.muli %4, %69 : i64
%71 = arith.addi %68, %70 : i64
%72 = tt.addptr %arg5, %71 : !tt.ptr, i64
%73 = tt.splat %arg17 : i32 -> tensor<128x1xi32>
%74 = arith.muli %26, %73 : tensor<128x1xi32>
%75 = tt.splat %72 : !tt.ptr -> tensor<128x1x!tt.ptr>
%76 = tt.addptr %75, %74 : tensor<128x1x!tt.ptr>, tensor<128x1xi32>
%77 = tt.broadcast %76 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr>
%78 = tt.addptr %77, %33 : tensor<128x128x!tt.ptr>, tensor<128x128xi32>
%79 = tt.splat %arg18 : i32 -> tensor<128x1xi32>
%80 = arith.cmpi slt, %26, %79 : tensor<128x1xi32>
%81 = tt.broadcast %80 : tensor<128x1xi1> -> tensor<128x128xi1>
%82 = tt.load %34, %81 : tensor<128x128x!tt.ptr>
%83 = tt.load %36 : !tt.ptr
%84:6 = scf.for %arg20 = %c0_i32 to %arg19 step %c64_i32 iter_args(%arg21 = %cst, %arg22 = %cst_3, %arg23 = %cst_0, %arg24 = %51, %arg25 = %52, %arg26 = %66) -> (
%89 = arith.subi %arg19, %arg20 : i32
%90 = tt.splat %89 : i32 -> tensor<1x64xi32>
%91 = arith.cmpi slt, %43, %90 : tensor<1x64xi32>
%92 = tt.broadcast %91 : tensor<1x64xi1> -> tensor<128x64xi1>
%93 = tt.load %arg24, %92 : tensor<128x64x!tt.ptr>
%94 = tt.load %arg25 : !tt.ptr
%95 = tt.dot %82, %93, %cst_1, inputPrecision = tf32 : tensor<128x128xi8> * tensor<128x64xi8> -> tensor<128x64xi32>
%96 = arith.sitofp %95 : tensor<128x64xi32> to tensor<128x64xf32>
%97 = tt.splat %83 : f32 -> tensor<128x64xf32>
%98 = arith.mulf %96, %97 : tensor<128x64xf32>
%99 = tt.splat %94 : f32 -> tensor<128x64xf32>
%100 = arith.mulf %98, %99 : tensor<128x64xf32>
%101 = "tt.reduce"(%100) <{axis = 1 : i32}> ({
^bb0(%arg27: f32, %arg28: f32):
%130 = arith.maxnumf %arg27, %arg28 : f32
tt.reduce.return %130 : f32
}) : (tensor<128x64xf32>) -> tensor<128xf32>
%102 = arith.maxnumf %arg23, %101 : tensor<128xf32>
%103 = tt.expand_dims %102 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32>
%104 = tt.broadcast %103 : tensor<128x1xf32> -> tensor<128x64xf32>
%105 = arith.subf %100, %104 : tensor<128x64xf32>
%106 = math.exp2 %105 : tensor<128x64xf32>
%107 = "tt.reduce"(%106) <{axis = 1 : i32}> ({
^bb0(%arg27: f32, %arg28: f32):
%130 = arith.addf %arg27, %arg28 : f32
tt.reduce.return %130 : f32
}) : (tensor<128x64xf32>) -> tensor<128xf32>
%108 = arith.subf %arg23, %102 : tensor<128xf32>
%109 = math.exp2 %108 : tensor<128xf32>
%110 = arith.mulf %arg21, %109 : tensor<128xf32>
%111 = arith.addf %110, %107 : tensor<128xf32>
%112 = tt.expand_dims %109 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32>
%113 = tt.broadcast %112 : tensor<128x1xf32> -> tensor<128x128xf32>
%114 = arith.mulf %arg22, %113 : tensor<128x128xf32>
%115 = tt.splat %89 : i32 -> tensor<64x1xi32>
%116 = arith.cmpi slt, %59, %115 : tensor<64x1xi32>
%117 = tt.broadcast %116 : tensor<64x1xi1> -> tensor<64x128xi1>
%118 = tt.load %arg26, %117 : tensor<64x128x!tt.ptr>
%119 = arith.truncf %106 : tensor<128x64xf32> to tensor<128x64xf16>
%120 = tt.dot %119, %118, %cst_2, inputPrecision = tf32 : tensor<128x64xf16> * tensor<64x128xf16> -> tensor<128x128xf16>
%121 = arith.extf %120 : tensor<128x128xf16> to tensor<128x128xf32>
%122 = arith.addf %114, %121 : tensor<128x128xf32>
%123 = arith.muli %arg11, %c64_i32 : i32
%124 = tt.splat %123 : i32 -> tensor<128x64xi32>
%125 = tt.addptr %arg24, %124 : tensor<128x64x!tt.ptr>, tensor<128x64xi32>
%126 = tt.addptr %arg25, %c1_i32 : !tt.ptr, i32
%127 = arith.muli %arg14, %c64_i32 : i32
%128 = tt.splat %127 : i32 -> tensor<64x128xi32>
%129 = tt.addptr %arg26, %128 : tensor<64x128x!tt.ptr>, tensor<64x128xi32>
scf.yield %111, %122, %102, %125, %126, %129 : tensor<128xf32>, tensor<128x128xf32>, tensor<128xf32>, tensor<128x64x!tt.ptr>, !tt.ptr, tensor<64x128x!tt.ptr>
} {tt.divisibility_arg1 = dense<64> : tensor<1xi32>}
%85 = tt.expand_dims %84#0 {axis = 1 : i32} : tensor<128xf32> -> tensor<128x1xf32>
%86 = tt.broadcast %85 : tensor<128x1xf32> -> tensor<128x128xf32>
%87 = arith.divf %84#1, %86 : tensor<128x128xf32>
%88 = arith.truncf %87 : tensor<128x128xf32> to tensor<128x128xf16>
tt.store %78, %88, %81 : tensor<128x128x!tt.ptr>
tt.return
}
}

{-#
external_resources: {
mlir_reproducer: {
pipeline: "builtin.module(convert-triton-to-tritongpu{num-ctas=1 num-warps=8 target=cuda:120 threads-per-warp=32}, tritongpu-coalesce, tritongpu-F32DotTC, triton-nvidia-gpu-plan-cta, tritongpu-remove-layout-conversions, tritongpu-optimize-thread-locality, tritongpu-accelerate-matmul, tritongpu-remove-layout-conversions, tritongpu-optimize-dot-operands{hoist-layout-conversion=true}, cse, tritongpu-fuse-nested-loops, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, loop-invariant-code-motion, tritongpu-optimize-accumulator-init, tritongpu-warp-spec-task-partition{num-consumer-groups=0}, triton-gpu-taskid-propagate{num-consumer-groups=0}, tritongpu-warp-spec-data-partition{num-consumer-groups=0}, tritongpu-warp-spec-code-partition{consumer-reg-inc=0 num-buffers=0 num-consumer-groups=0 producer-reg-dec=0}, tritongpu-pipeline{dump-intermediate-steps=false num-stages=4}, tritongpu-combine-tensor-select-and-if, tritongpu-promote-lhs-to-tmem, tritongpu-keep-acc-in-tmem, tritongpu-warp-spec-lowering{num-consumer-groups=0}, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritongpu-prefetch, tritongpu-optimize-dot-operands{hoist-layout-conversion=true}, tritongpu-coalesce-async-copy, tritongpu-remove-layout-conversions, tritongpu-reduce-data-duplication, tritongpu-reorder-instructions, cse, symbol-dce, triton-nvidia-gpu-fence-insertion{compute-capability=90}, triton-nvidia-tma-lowering, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, tritongpu-warp-spec-canonicalization{num-consumer-groups=0})",
disable_threading: false,
verify_each: true
}
}
#-}
/home/qus/AI/ComfyUI/.venv/lib/python3.12/site-packages/sageattention/attn_qk_int8_per_block.py:40:0: error: Failures have been detected while processing an MLIR pass pipeline
/home/qus/AI/ComfyUI/.venv/lib/python3.12/site-packages/sageattention/attn_qk_int8_per_block.py:40:0: note: Pipeline failed while executing [TritonGPUAccelerateMatmul on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.``

Environment details

Triton: 3.3.0
GPU: 5090

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions