@@ -2968,8 +2968,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
     }
   }
 
-  mlir::cir::VectorType Ty = GetNeonType(this, Type);
-  if (!Ty)
+  mlir::cir::VectorType ty = GetNeonType(this, Type);
+  if (!ty)
     return nullptr;
 
   // Not all intrinsics handled by the common case work for AArch64 yet, so only
@@ -2986,7 +2986,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
           buildAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
     return V;
 
-  mlir::cir::VectorType VTy = Ty;
+  mlir::cir::VectorType vTy = ty;
   llvm::SmallVector<mlir::Value, 4> args;
   switch (BuiltinID) {
   default:
@@ -3066,8 +3066,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
     // https://developer.arm.com/architectures/instruction-sets/intrinsics/
     return buildNeonCall(
         BuiltinID, *this,
-        {builder.getExtendedElementVectorType(Ty, true), SInt32Ty}, Ops,
-        "llvm.aarch64.neon.sqrshrun", Ty, getLoc(E->getExprLoc()));
+        {builder.getExtendedElementVectorType(ty, true), SInt32Ty}, Ops,
+        "llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
   case NEON::BI__builtin_neon_vqshrn_n_v:
     llvm_unreachable("NYI");
   case NEON::BI__builtin_neon_vrshrn_n_v:
@@ -3080,7 +3080,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v: {
     assert(!MissingFeatures::buildConstrainedFPCall());
-    return buildNeonCall(BuiltinID, *this, {Ty}, Ops, "llvm.round", Ty,
+    return buildNeonCall(BuiltinID, *this, {ty}, Ops, "llvm.round", ty,
                          getLoc(E->getExprLoc()));
   }
   case NEON::BI__builtin_neon_vrndih_f16: {
@@ -3407,20 +3407,20 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vld1_v:
   case NEON::BI__builtin_neon_vld1q_v: {
-    return builder.createAlignedLoad(Ops[0].getLoc(), VTy, Ops[0],
+    return builder.createAlignedLoad(Ops[0].getLoc(), vTy, Ops[0],
                                      PtrOp0.getAlignment());
   }
   case NEON::BI__builtin_neon_vst1_v:
   case NEON::BI__builtin_neon_vst1q_v: {
-    Ops[1] = builder.createBitcast(Ops[1], VTy);
+    Ops[1] = builder.createBitcast(Ops[1], vTy);
     (void)builder.createAlignedStore(Ops[1].getLoc(), Ops[1], Ops[0],
                                      PtrOp0.getAlignment());
     return Ops[1];
   }
   case NEON::BI__builtin_neon_vld1_lane_v:
   case NEON::BI__builtin_neon_vld1q_lane_v: {
-    Ops[1] = builder.createBitcast(Ops[1], VTy);
-    Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), VTy.getEltType(),
+    Ops[1] = builder.createBitcast(Ops[1], vTy);
+    Ops[0] = builder.createAlignedLoad(Ops[0].getLoc(), vTy.getEltType(),
                                        Ops[0], PtrOp0.getAlignment());
     return builder.create<mlir::cir::VecInsertOp>(getLoc(E->getExprLoc()),
                                                   Ops[1], Ops[0], Ops[2]);
@@ -3435,7 +3435,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vst1_lane_v:
   case NEON::BI__builtin_neon_vst1q_lane_v: {
-    Ops[1] = builder.createBitcast(Ops[1], Ty);
+    Ops[1] = builder.createBitcast(Ops[1], ty);
     Ops[1] = builder.create<mlir::cir::VecExtractOp>(Ops[1].getLoc(), Ops[1],
                                                      Ops[2]);
     (void)builder.createAlignedStore(getLoc(E->getExprLoc()), Ops[1], Ops[0],
@@ -3508,7 +3508,41 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   }
   case NEON::BI__builtin_neon_vtrn_v:
   case NEON::BI__builtin_neon_vtrnq_v: {
-    llvm_unreachable("NYI");
+    // This set of NEON intrinsics implements SIMD matrix transpose.
+    // The matrix transposed is always 2x2, and these intrinsics transpose
+    // multiple 2x2 matrices in parallel; that is why the result type is
+    // always a 2-D matrix whose last dimension is 2.
+    // For example, `vtrn_s16` would have:
+    //   input 1: {0, 1, 2, 3}
+    //   input 2: {4, 5, 6, 7}
+    // This basically represents two 2x2 matrices:
+    //   [ 0, 1 ] and [ 2, 3 ]
+    //   [ 4, 5 ]     [ 6, 7 ]
+    // They are transposed simultaneously and independently.
+    // Thus, the result is:
+    //   { {0, 4, 2, 6},
+    //     {1, 5, 3, 7} }
+    Ops[1] = builder.createBitcast(Ops[1], ty);
+    Ops[2] = builder.createBitcast(Ops[2], ty);
+    // Adding a bitcast here as Ops[0] might be a void pointer.
+    mlir::Value baseAddr =
+        builder.createBitcast(Ops[0], builder.getPointerTo(ty));
+    mlir::Value sv;
+    mlir::Location loc = getLoc(E->getExprLoc());
+
+    for (unsigned vi = 0; vi != 2; ++vi) {
+      llvm::SmallVector<int64_t, 16> indices;
+      for (unsigned i = 0, e = vTy.getSize(); i != e; i += 2) {
+        indices.push_back(i + vi);
+        indices.push_back(i + e + vi);
+      }
+      mlir::cir::ConstantOp idx = builder.getConstInt(loc, PtrDiffTy, vi);
+      mlir::Value addr = builder.create<mlir::cir::PtrStrideOp>(
+          loc, baseAddr.getType(), baseAddr, idx);
+      sv = builder.createVecShuffle(loc, Ops[1], Ops[2], indices);
+      (void)builder.CIRBaseBuilderTy::createStore(loc, sv, addr);
+    }
+    return sv;
   }
   case NEON::BI__builtin_neon_vuzp_v:
   case NEON::BI__builtin_neon_vuzpq_v: {
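To sanity-check the worked example in the new vtrn comment, here is a minimal standalone sketch (not part of the commit) exercising the corresponding `arm_neon.h` intrinsic; it assumes an AArch64 target with NEON available:

```cpp
// Demonstrates the parallel-2x2-transpose semantics that the vtrn_v/vtrnq_v
// lowering above implements. vtrn_s16 returns both result rows:
// val[0] holds the even-lane pairs, val[1] the odd-lane pairs.
#include <arm_neon.h>
#include <cstdio>

int main() {
  const int16_t a[4] = {0, 1, 2, 3}; // input 1 from the comment
  const int16_t b[4] = {4, 5, 6, 7}; // input 2 from the comment
  int16x4x2_t r = vtrn_s16(vld1_s16(a), vld1_s16(b));

  int16_t out[8];
  vst1_s16(out, r.val[0]);     // expected {0, 4, 2, 6}
  vst1_s16(out + 4, r.val[1]); // expected {1, 5, 3, 7}
  for (int i = 0; i < 8; ++i)
    printf("%d ", out[i]);     // prints: 0 4 2 6 1 5 3 7
  printf("\n");
  return 0;
}
```

This matches the shuffle indices computed by the loop in the lowering: with e = 4 lanes per vector, vi = 0 yields {0, 4, 2, 6} and vi = 1 yields {1, 5, 3, 7}.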