diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index aacc456bd5431..7d43ba81339e0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20937,6 +20937,53 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
                          Store->getMemOperand());
 }
 
+// Combine store (trunc X to <3 x i8>) to a sequence of ST1.b.
+static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  SDValue Value = ST->getValue();
+  EVT ValueVT = Value.getValueType();
+
+  if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+      Value.getOpcode() != ISD::TRUNCATE ||
+      ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
+    return SDValue();
+
+  assert(ST->getOffset().isUndef() && "undef offset expected");
+  SDLoc DL(ST);
+  auto WideVT = EVT::getVectorVT(
+      *DAG.getContext(),
+      Value->getOperand(0).getValueType().getVectorElementType(), 4);
+  SDValue UndefVector = DAG.getUNDEF(WideVT);
+  SDValue WideTrunc = DAG.getNode(
+      ISD::INSERT_SUBVECTOR, DL, WideVT,
+      {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
+  SDValue Cast = DAG.getNode(
+      ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
+      WideTrunc);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDValue Chain = ST->getChain();
+  MachineMemOperand *MMO = ST->getMemOperand();
+  unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
+  SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+                           DAG.getConstant(2 * IdxScale, DL, MVT::i64));
+  TypeSize Offset2 = TypeSize::getFixed(2);
+  SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
+  Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
+
+  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+                           DAG.getConstant(1 * IdxScale, DL, MVT::i64));
+  TypeSize Offset1 = TypeSize::getFixed(1);
+  SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
+  Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
+
+  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
+                           DAG.getConstant(0, DL, MVT::i64));
+  Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
+                       MF.getMachineMemOperand(MMO, 0, 1));
+  return Chain;
+}
+
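+// For example, with a <3 x i16> truncate source (an illustrative sketch of
+// the DAG built by combineI8TruncStore; an i32 source works the same way
+// with IdxScale = 4 and a v16i8 cast):
+//   store (trunc X:v3i16 to <3 x i8>), ptr
+// becomes roughly
+//   wide  = insert_subvector (undef:v4i16), X, 0
+//   bytes = bitcast wide to v8i8
+//   store (extract_elt bytes, 4), ptr + 2  // IdxScale = 2 for an i16 source
+//   store (extract_elt bytes, 2), ptr + 1
+//   store (extract_elt bytes, 0), ptr
+// and each byte store then selects to a plain strb/st1.b.
+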
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -20952,6 +20999,9 @@ static SDValue performSTORECombine(SDNode *N,
     return EltVT == MVT::f32 || EltVT == MVT::f64;
   };
 
+  if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
+    return Res;
+
   // If this is an FP_ROUND followed by a store, fold this into a truncating
   // store. We can do this even if this is already a truncstore.
   // We purposefully don't care about legality of the nodes here as we know
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
new file mode 100644
index 0000000000000..2ec6ae9d99389
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -0,0 +1,801 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=arm64-apple-macosx -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck --check-prefix BE %s
+
+define <16 x i8> @load_v3i8(ptr %src) {
+; CHECK-LABEL: load_v3i8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: umov.h w8, v0[0]
+; CHECK-NEXT: umov.h w9, v0[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: add x8, x0, #2
+; CHECK-NEXT: mov.b v0[1], w9
+; CHECK-NEXT: ld1.b { v0 }[2], [x8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i8:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldrh w8, [x0]
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: umov w8, v0.h[0]
+; BE-NEXT: umov w9, v0.h[1]
+; BE-NEXT: fmov s0, w8
+; BE-NEXT: add x8, x0, #2
+; BE-NEXT: mov v0.b[1], w9
+; BE-NEXT: ld1 { v0.b }[2], [x8]
+; BE-NEXT: rev64 v0.16b, v0.16b
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i8>, ptr %src, align 1
+  %s = shufflevector <3 x i8> poison, <3 x i8> %l, <16 x i32>
+  ret <16 x i8> %s
+}
+
+define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
+; CHECK-LABEL: load_v3i8_to_4xi32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldrsb w8, [x0, #2]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: mov.h v0[1], v0[1]
+; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i8_to_4xi32:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldrh w8, [x0]
+; BE-NEXT: movi v1.2d, #0x0000ff000000ff
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: ldrsb w8, [x0, #2]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: mov v0.h[1], v0.h[1]
+; BE-NEXT: mov v0.h[2], w8
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: and v0.16b, v0.16b, v1.16b
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i8>, ptr %src, align 1
+  %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32>
+  %e = zext <4 x i8> %s to <4 x i32>
+  ret <4 x i32> %e
+}
+
+define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
+; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldrsb w8, [x0, #2]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: mov.h v0[1], v0[1]
+; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i8_to_4xi32_align_2:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldrh w8, [x0]
+; BE-NEXT: movi v1.2d, #0x0000ff000000ff
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: ldrsb w8, [x0, #2]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: mov v0.h[1], v0.h[1]
+; BE-NEXT: mov v0.h[2], w8
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: and v0.16b, v0.16b, v1.16b
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i8>, ptr %src, align 2
+  %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32>
+  %e = zext <4 x i8> %s to <4 x i32>
+  ret <4 x i32> %e
+}
+
+define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) {
+; CHECK-LABEL: load_v3i8_to_4xi32_align_4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
+; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i8_to_4xi32_align_4:
+; BE: // %bb.0:
+; BE-NEXT: ldr s0, [x0]
+; BE-NEXT: movi v1.2d, #0x0000ff000000ff
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; BE-NEXT: rev16 v0.8b, v0.8b
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: and v0.16b, v0.16b, v1.16b
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: ret
+  %l = load <3 x i8>, ptr %src, align 4
+  %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32>
+  %e = zext <4 x i8> %s to <4 x i32>
+  ret <4 x i32> %e
+}
+
+define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
+; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldurh w8, [x0, #1]
+; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldrsb w8, [x0, #3]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: mov.h v0[1], v0[1]
+; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i8_to_4xi32_const_offset_1:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldurh w8, [x0, #1]
+; BE-NEXT: movi v1.2d, #0x0000ff000000ff
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: ldrsb w8, [x0, #3]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: mov v0.h[1], v0.h[1]
+; BE-NEXT: mov v0.h[2], w8
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: and v0.16b, v0.16b, v1.16b
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %src.1 = getelementptr inbounds i8, ptr %src, i64 1
+  %l = load <3 x i8>, ptr %src.1, align 1
+  %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32>
+  %e = zext <4 x i8> %s to <4 x i32>
+  ret <4 x i32> %e
+}
+
+define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
+; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldurh w8, [x0, #3]
+; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldrsb w8, [x0, #5]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: mov.h v0[1], v0[1]
+; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i8_to_4xi32_const_offset_3:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldurh w8, [x0, #3]
+; BE-NEXT: movi v1.2d, #0x0000ff000000ff
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: ldrsb w8, [x0, #5]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: mov v0.h[1], v0.h[1]
+; BE-NEXT: mov v0.h[2], w8
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: and v0.16b, v0.16b, v1.16b
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %src.3 = getelementptr inbounds i8, ptr %src, i64 3
+  %l = load <3 x i8>, ptr %src.3, align 1
+  %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32>
+  %e = zext <4 x i8> %s to <4 x i32>
+  ret <4 x i32> %e
+}
+
+define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) {
+; CHECK-LABEL: volatile_load_v3i8_to_4xi32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldrsb w8, [x0, #2]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: mov.h v0[1], v0[1]
+; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: volatile_load_v3i8_to_4xi32:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldrh w8, [x0]
+; BE-NEXT: movi v1.2d, #0x0000ff000000ff
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: ldrsb w8, [x0, #2]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: mov v0.h[1], v0.h[1]
+; BE-NEXT: mov v0.h[2], w8
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: and v0.16b, v0.16b, v1.16b
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load volatile <3 x i8>, ptr %src, align 1
+  %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32>
+  %e = zext <4 x i8> %s to <4 x i32>
+  ret <4 x i32> %e
+}
+
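+; Volatile loads are not candidates for the combines exercised above; the
+; checks pin down the scalar ldrh + ldrsb lowering. (The store side is
+; likewise guarded: combineI8TruncStore bails out on ST->isVolatile(); see
+; shift_trunc_volatile_store at the end of the file.)
+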
+define <3 x i32> @load_v3i32(ptr %src) {
+; CHECK-LABEL: load_v3i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: add x8, x0, #8
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ld1.s { v0 }[2], [x8]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i32:
+; BE: // %bb.0:
+; BE-NEXT: ldr d0, [x0]
+; BE-NEXT: add x8, x0, #8
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ld1 { v0.s }[2], [x8]
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: ret
+  %l = load <3 x i32>, ptr %src, align 1
+  ret <3 x i32> %l
+}
+
+define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
+; CHECK-LABEL: store_trunc_from_64bits:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: add x8, x0, #4
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: add x10, x1, #1
+; CHECK-NEXT: ld1r.4h { v0 }, [x8]
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: add x8, x1, #2
+; CHECK-NEXT: strb w9, [x1]
+; CHECK-NEXT: st1.b { v1 }[2], [x10]
+; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: store_trunc_from_64bits:
+; BE: // %bb.0: // %entry
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldr s0, [x0]
+; BE-NEXT: ldrh w8, [x0, #4]
+; BE-NEXT: rev32 v0.4h, v0.4h
+; BE-NEXT: mov v0.h[2], w8
+; BE-NEXT: xtn v0.8b, v0.8h
+; BE-NEXT: rev32 v0.16b, v0.16b
+; BE-NEXT: str s0, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #2]
+; BE-NEXT: strh w9, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+entry:
+  %l = load <3 x i16>, ptr %src, align 1
+  %t = trunc <3 x i16> %l to <3 x i8>
+  store <3 x i8> %t, ptr %dst, align 1
+  ret void
+}
+
+define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
+; CHECK-LABEL: store_trunc_add_from_64bits:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: add x8, x0, #4
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: Lloh0:
+; CHECK-NEXT: adrp x9, lCPI9_0@PAGE
+; CHECK-NEXT: ld1.h { v0 }[2], [x8]
+; CHECK-NEXT: add x8, x1, #1
+; CHECK-NEXT: Lloh1:
+; CHECK-NEXT: ldr d1, [x9, lCPI9_0@PAGEOFF]
+; CHECK-NEXT: add x9, x1, #2
+; CHECK-NEXT: add.4h v0, v0, v1
+; CHECK-NEXT: st1.b { v0 }[2], [x8]
+; CHECK-NEXT: st1.b { v0 }[4], [x9]
+; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: ret
+; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
+;
+; BE-LABEL: store_trunc_add_from_64bits:
+; BE: // %bb.0: // %entry
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldr s0, [x0]
+; BE-NEXT: add x8, x0, #4
+; BE-NEXT: adrp x9, .LCPI9_0
+; BE-NEXT: add x9, x9, :lo12:.LCPI9_0
+; BE-NEXT: rev32 v0.4h, v0.4h
+; BE-NEXT: ld1 { v1.4h }, [x9]
+; BE-NEXT: ld1 { v0.h }[2], [x8]
+; BE-NEXT: add v0.4h, v0.4h, v1.4h
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #2]
+; BE-NEXT: strh w9, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+entry:
+  %l = load <3 x i16>, ptr %src, align 1
+  %a = add <3 x i16> %l,
+  %t = trunc <3 x i16> %a to <3 x i8>
+  store <3 x i8> %t, ptr %dst, align 1
+  ret void
+}
+
+define void @load_ext_to_64bits(ptr %src, ptr %dst) {
+; CHECK-LABEL: load_ext_to_64bits:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: add x8, x0, #2
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ld1.b { v0 }[4], [x8]
+; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_ext_to_64bits:
+; BE: // %bb.0: // %entry
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldrh w8, [x0]
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: add x8, x0, #2
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: ld1 { v0.b }[4], [x8]
+; BE-NEXT: add x8, x1, #4
+; BE-NEXT: bic v0.4h, #255, lsl #8
+; BE-NEXT: rev32 v1.8h, v0.8h
+; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: str s1, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+entry:
+  %l = load <3 x i8>, ptr %src, align 1
+  %e = zext <3 x i8> %l to <3 x i16>
+  store <3 x i16> %e, ptr %dst, align 1
+  ret void
+}
+
+define void @load_ext_to_64bits_default_align(ptr %src, ptr %dst) {
+; CHECK-LABEL: load_ext_to_64bits_default_align:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_ext_to_64bits_default_align:
+; BE: // %bb.0: // %entry
+; BE-NEXT: ldr s0, [x0]
+; BE-NEXT: add x8, x1, #4
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; BE-NEXT: rev16 v0.8b, v0.8b
+; BE-NEXT: bic v0.4h, #255, lsl #8
+; BE-NEXT: rev32 v1.8h, v0.8h
+; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: str s1, [x1]
+; BE-NEXT: ret
+entry:
+  %l = load <3 x i8>, ptr %src
+  %e = zext <3 x i8> %l to <3 x i16>
+  store <3 x i16> %e, ptr %dst, align 1
+  ret void
+}
+
+define void @load_ext_to_64bits_align_4(ptr %src, ptr %dst) {
+; CHECK-LABEL: load_ext_to_64bits_align_4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_ext_to_64bits_align_4:
+; BE: // %bb.0: // %entry
+; BE-NEXT: ldr s0, [x0]
+; BE-NEXT: add x8, x1, #4
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; BE-NEXT: rev16 v0.8b, v0.8b
+; BE-NEXT: bic v0.4h, #255, lsl #8
+; BE-NEXT: rev32 v1.8h, v0.8h
+; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: str s1, [x1]
+; BE-NEXT: ret
+entry:
+  %l = load <3 x i8>, ptr %src, align 4
+  %e = zext <3 x i8> %l to <3 x i16>
+  store <3 x i16> %e, ptr %dst, align 1
+  ret void
+}
+
+define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
+; CHECK-LABEL: load_ext_add_to_64bits:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: Lloh2:
+; CHECK-NEXT: adrp x9, lCPI13_0@PAGE
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: add x8, x0, #2
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: Lloh3:
+; CHECK-NEXT: ldr d1, [x9, lCPI13_0@PAGEOFF]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ld1.b { v0 }[4], [x8]
+; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: add.4h v0, v0, v1
+; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
+;
+; BE-LABEL: load_ext_add_to_64bits:
+; BE: // %bb.0: // %entry
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldrh w8, [x0]
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: add x8, x0, #2
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: ld1 { v0.b }[4], [x8]
+; BE-NEXT: adrp x8, .LCPI13_0
+; BE-NEXT: add x8, x8, :lo12:.LCPI13_0
+; BE-NEXT: ld1 { v1.4h }, [x8]
+; BE-NEXT: bic v0.4h, #255, lsl #8
+; BE-NEXT: add x8, x1, #4
+; BE-NEXT: add v0.4h, v0.4h, v1.4h
+; BE-NEXT: rev32 v1.8h, v0.8h
+; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: str s1, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+entry:
+  %l = load <3 x i8>, ptr %src, align 1
+  %e = zext <3 x i8> %l to <3 x i16>
+  %a = add <3 x i16> %e,
+  store <3 x i16> %a, ptr %dst, align 1
+  ret void
+}
+
+define void @shift_trunc_store(ptr %src, ptr %dst) {
+; CHECK-LABEL: shift_trunc_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: add x8, x1, #1
+; CHECK-NEXT: add x9, x1, #2
+; CHECK-NEXT: ushr.4s v0, v0, #16
+; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: shift_trunc_store:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ld1 { v0.4s }, [x0]
+; BE-NEXT: shrn v0.4h, v0.4s, #16
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #2]
+; BE-NEXT: strh w9, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i32>, ptr %src
+  %s = lshr <3 x i32> %l,
+  %t = trunc <3 x i32> %s to <3 x i8>
+  store <3 x i8> %t, ptr %dst, align 1
+  ret void
+}
+
+define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
+; CHECK-LABEL: shift_trunc_store_default_align:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: add x8, x1, #1
+; CHECK-NEXT: add x9, x1, #2
+; CHECK-NEXT: ushr.4s v0, v0, #16
+; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: shift_trunc_store_default_align:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ld1 { v0.4s }, [x0]
+; BE-NEXT: shrn v0.4h, v0.4s, #16
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #2]
+; BE-NEXT: strh w9, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i32>, ptr %src
+  %s = lshr <3 x i32> %l,
+  %t = trunc <3 x i32> %s to <3 x i8>
+  store <3 x i8> %t, ptr %dst
+  ret void
+}
+
+define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
+; CHECK-LABEL: shift_trunc_store_align_4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: add x8, x1, #1
+; CHECK-NEXT: add x9, x1, #2
+; CHECK-NEXT: ushr.4s v0, v0, #16
+; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: st1.b { v0 }[0], [x1]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: shift_trunc_store_align_4:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ld1 { v0.4s }, [x0]
+; BE-NEXT: shrn v0.4h, v0.4s, #16
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #2]
+; BE-NEXT: strh w9, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i32>, ptr %src
+  %s = lshr <3 x i32> %l,
+  %t = trunc <3 x i32> %s to <3 x i8>
+  store <3 x i8> %t, ptr %dst, align 4
+  ret void
+}
+
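+; The two tests below store to a constant offset from %dst; the three byte
+; stores emitted by the combine are expected to be based on the adjusted
+; pointer (base, base + 1 and base + 2, built via getMemBasePlusOffset).
+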
+define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
+; CHECK-LABEL: shift_trunc_store_const_offset_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: add x8, x1, #2
+; CHECK-NEXT: add x9, x1, #3
+; CHECK-NEXT: add x10, x1, #1
+; CHECK-NEXT: ushr.4s v0, v0, #16
+; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: st1.b { v0 }[0], [x10]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: shift_trunc_store_const_offset_1:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ld1 { v0.4s }, [x0]
+; BE-NEXT: shrn v0.4h, v0.4s, #16
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #3]
+; BE-NEXT: sturh w9, [x1, #1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i32>, ptr %src
+  %s = lshr <3 x i32> %l,
+  %t = trunc <3 x i32> %s to <3 x i8>
+  %dst.1 = getelementptr inbounds i8, ptr %dst, i64 1
+  store <3 x i8> %t, ptr %dst.1, align 1
+  ret void
+}
+
+define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
+; CHECK-LABEL: shift_trunc_store_const_offset_3:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: add x9, x1, #5
+; CHECK-NEXT: add x10, x1, #3
+; CHECK-NEXT: ushr.4s v0, v0, #16
+; CHECK-NEXT: st1.b { v0 }[4], [x8]
+; CHECK-NEXT: st1.b { v0 }[8], [x9]
+; CHECK-NEXT: st1.b { v0 }[0], [x10]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: shift_trunc_store_const_offset_3:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ld1 { v0.4s }, [x0]
+; BE-NEXT: shrn v0.4h, v0.4s, #16
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #5]
+; BE-NEXT: sturh w9, [x1, #3]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i32>, ptr %src
+  %s = lshr <3 x i32> %l,
+  %t = trunc <3 x i32> %s to <3 x i8>
+  %dst.3 = getelementptr inbounds i8, ptr %dst, i64 3
+  store <3 x i8> %t, ptr %dst.3, align 1
+  ret void
+}
+
+define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
+; CHECK-LABEL: shift_trunc_volatile_store:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: shrn.4h v0, v0, #16
+; CHECK-NEXT: xtn.8b v1, v0
+; CHECK-NEXT: umov.h w8, v0[2]
+; CHECK-NEXT: str s1, [sp, #12]
+; CHECK-NEXT: ldrh w9, [sp, #12]
+; CHECK-NEXT: strb w8, [x1, #2]
+; CHECK-NEXT: strh w9, [x1]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: shift_trunc_volatile_store:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ld1 { v0.4s }, [x0]
+; BE-NEXT: shrn v0.4h, v0.4s, #16
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #2]
+; BE-NEXT: strh w9, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+  %l = load <3 x i32>, ptr %src
+  %s = lshr <3 x i32> %l,
+  %t = trunc <3 x i32> %s to <3 x i8>
+  store volatile <3 x i8> %t, ptr %dst, align 1
+  ret void
+}
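+
+; Note: combineI8TruncStore returns early for volatile stores
+; (ST->isVolatile()), so shift_trunc_volatile_store above keeps the generic
+; scalarized lowering (strh + strb via a stack slot) on both endiannesses.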