Skip to content

Conversation

@tgymnich
Copy link
Member

@tgymnich tgymnich commented Jan 4, 2024

Intrinsics like @air.max.s64 don't seem to work on Intel Macs, however @air.max.s.i64 does work.

Interestingly the @air.max.s64 intrinsic does work on M-series chips just fine and the final AGX is the same as with @air.max.s.i64.

The Apple Metal compiler seems to prefer the latter intrinsic style (@air.max.s.i64):

#include <metal_stdlib>
using namespace metal;


kernel void add_arrays(device const long2* inA,
                       device const long2* inB,
                       device long2* result,
                       uint index [[thread_position_in_grid]])
{
  result[index] = max(inA[index], inB[index]);
}
source_filename = "add_arrays"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
target triple = "air64-apple-macosx14.0.0"

; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
define void @add_arrays(<2 x i64> addrspace(1)* nocapture noundef readonly "air-buffer-no-alias" %0, <2 x i64> addrspace(1)* nocapture noundef readonly "air-buffer-no-alias" %1, <2 x i64> addrspace(1)* nocapture noundef writeonly "air-buffer-no-alias" %2, i32 noundef %3) local_unnamed_addr #0 !dbg !33 {
  %5 = zext i32 %3 to i64, !dbg !35
  %6 = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %0, i64 %5, !dbg !35
  %7 = load <2 x i64>, <2 x i64> addrspace(1)* %6, align 16, !dbg !35, !tbaa !36, !alias.scope !39, !noalias !42
  %8 = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %1, i64 %5, !dbg !45
  %9 = load <2 x i64>, <2 x i64> addrspace(1)* %8, align 16, !dbg !45, !tbaa !36, !alias.scope !46, !noalias !47
  %10 = tail call <2 x i64> @air.max.s.v2i64(<2 x i64> %7, <2 x i64> %9) #2, !dbg !48
  %11 = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %2, i64 %5, !dbg !52
  store <2 x i64> %10, <2 x i64> addrspace(1)* %11, align 16, !dbg !53, !tbaa !36, !alias.scope !54, !noalias !55
  ret void, !dbg !56
}

; Function Attrs: mustprogress nofree nosync nounwind readnone willreturn
declare <2 x i64> @air.max.s.v2i64(<2 x i64>, <2 x i64>) local_unnamed_addr #1

attributes #0 = { argmemonly mustprogress nofree nosync nounwind willreturn "approx-func-fp-math"="true" "frame-pointer"="all" "min-legal-vector-width"="128" "no-builtins" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" }
attributes #1 = { mustprogress nofree nosync nounwind readnone willreturn }
attributes #2 = { nounwind readnone willreturn }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
!llvm.ident = !{!20}
!air.version = !{!21}
!air.language_version = !{!22}
!air.compile_options = !{!23, !24, !25}
!air.kernel = !{!26}

!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "Apple metal version 32023.35 (metalfe-32023.35)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, imports: !2, splitDebugInlining: false, nameTableKind: None, sysroot: "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk", sdk: "MacOSX14.0.sdk")
!1 = !DIFile(filename: "/Users/tim/Developer/mtltest/mtltest/mtltest.metal", directory: "/Users/tim/Developer/mtltest")
!2 = !{!3, !6}
!3 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !4, file: !5, line: 1)
!4 = !DIModule(scope: null, name: "metal_types", includePath: "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/metal/macos/lib/clang/32023.35/include/metal")
!5 = !DIFile(filename: "<built-in>", directory: "/Users/tim/Developer/mtltest")
!6 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !7, file: !8, line: 8)
!7 = !DIModule(scope: null, name: "metal_stdlib", includePath: "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/metal/macos/lib/clang/32023.35/include/metal")
!8 = !DIFile(filename: "mtltest/mtltest.metal", directory: "/Users/tim/Developer/mtltest")
!9 = !{i32 2, !"SDK Version", [2 x i32] [i32 14, i32 0]}
!10 = !{i32 7, !"Dwarf Version", i32 4}
!11 = !{i32 2, !"Debug Info Version", i32 3}
!12 = !{i32 1, !"wchar_size", i32 4}
!13 = !{i32 7, !"frame-pointer", i32 2}
!14 = !{i32 7, !"air.max_device_buffers", i32 31}
!15 = !{i32 7, !"air.max_constant_buffers", i32 31}
!16 = !{i32 7, !"air.max_threadgroup_buffers", i32 31}
!17 = !{i32 7, !"air.max_textures", i32 128}
!18 = !{i32 7, !"air.max_read_write_textures", i32 8}
!19 = !{i32 7, !"air.max_samplers", i32 16}
!20 = !{!"Apple metal version 32023.35 (metalfe-32023.35)"}
!21 = !{i32 2, i32 6, i32 0}
!22 = !{!"Metal", i32 3, i32 1, i32 0}
!23 = !{!"air.compile.denorms_disable"}
!24 = !{!"air.compile.fast_math_enable"}
!25 = !{!"air.compile.framebuffer_fetch_enable"}
!26 = !{void (<2 x i64> addrspace(1)*, <2 x i64> addrspace(1)*, <2 x i64> addrspace(1)*, i32)* @add_arrays, !27, !28}
!27 = !{}
!28 = !{!29, !30, !31, !32}
!29 = !{i32 0, !"air.buffer", !"air.location_index", i32 0, i32 1, !"air.read", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"long2", !"air.arg_name", !"inA"}
!30 = !{i32 1, !"air.buffer", !"air.location_index", i32 1, i32 1, !"air.read", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"long2", !"air.arg_name", !"inB"}
!31 = !{i32 2, !"air.buffer", !"air.location_index", i32 2, i32 1, !"air.read_write", !"air.address_space", i32 1, !"air.arg_type_size", i32 16, !"air.arg_type_align_size", i32 16, !"air.arg_type_name", !"long2", !"air.arg_name", !"result"}
!32 = !{i32 3, !"air.thread_position_in_grid", !"air.arg_type_name", !"uint", !"air.arg_name", !"index"}
!33 = distinct !DISubprogram(name: "add_arrays", scope: !8, file: !8, line: 12, type: !34, scopeLine: 16, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !27)
!34 = !DISubroutineType(types: !27)
!35 = !DILocation(line: 17, column: 23, scope: !33)
!36 = !{!37, !37, i64 0}
!37 = !{!"omnipotent char", !38, i64 0}
!38 = !{!"Simple C++ TBAA"}
!39 = !{!40}
!40 = distinct !{!40, !41, !"air-alias-scope-arg(0)"}
!41 = distinct !{!41, !"air-alias-scopes(add_arrays)"}
!42 = !{!43, !44}
!43 = distinct !{!43, !41, !"air-alias-scope-arg(1)"}
!44 = distinct !{!44, !41, !"air-alias-scope-arg(2)"}
!45 = !DILocation(line: 17, column: 35, scope: !33)
!46 = !{!43}
!47 = !{!40, !44}
!48 = !DILocation(line: 3790, column: 10, scope: !49, inlinedAt: !51)
!49 = distinct !DISubprogram(name: "max", scope: !50, file: !50, line: 3788, type: !34, scopeLine: 3789, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !27)
!50 = !DIFile(filename: "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/metal/macos/lib/clang/32023.35/include/metal/metal_integer", directory: "")
!51 = distinct !DILocation(line: 17, column: 19, scope: !33)
!52 = !DILocation(line: 17, column: 3, scope: !33)
!53 = !DILocation(line: 17, column: 17, scope: !33)
!54 = !{!44}
!55 = !{!40, !43}
!56 = !DILocation(line: 18, column: 1, scope: !33)

fixes JuliaGPU/Metal.jl#274

@maleadt
Copy link
Member

maleadt commented Jan 4, 2024

Can you check what Metal generates for vector intrinsics? air.max.v2s.i64 looks wrong (and the logic needs to be updated).

@codecov
Copy link

codecov bot commented Jan 4, 2024

Codecov Report

Attention: 1 lines in your changes are missing coverage. Please review.

Comparison is base (111685f) 84.60% compared to head (75d0516) 75.28%.

Files Patch % Lines
src/metal.jl 75.00% 1 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##           master     #538      +/-   ##
==========================================
- Coverage   84.60%   75.28%   -9.32%     
==========================================
  Files          24       24              
  Lines        3325     3298      -27     
==========================================
- Hits         2813     2483     -330     
- Misses        512      815     +303     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

@tgymnich tgymnich force-pushed the fix-intrinsic-lowering branch from 9fa7f77 to 7f8646a Compare January 4, 2024 14:41
@tgymnich
Copy link
Member Author

tgymnich commented Jan 4, 2024

@maleadt fixed

@tgymnich tgymnich force-pushed the fix-intrinsic-lowering branch from 7f8646a to 75d0516 Compare January 4, 2024 23:03
@maleadt maleadt merged commit 962b84e into JuliaGPU:master Jan 5, 2024
@maleadt
Copy link
Member

maleadt commented Jan 5, 2024

Thanks!

@tgymnich tgymnich deleted the fix-intrinsic-lowering branch January 5, 2024 15:24
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Error with Julia 1.10

2 participants