From db38ad21308afe5e1f73dbe55ba261def7a46a1d Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Mon, 20 Mar 2023 16:37:17 +0100 Subject: [PATCH 1/4] SN_Sum operation on arm64. Fixed dup. Replaced addv, addp, faddp with their generalized variants. Added OP_EXTRACTx opcodes to arm64 codegen. Added horizontal sums. --- src/mono/mono/arch/arm64/arm64-codegen.h | 54 +++++++--------------- src/mono/mono/arch/arm64/codegen-test.c | 5 ++ src/mono/mono/mini/cpu-arm64.mdesc | 7 +++ src/mono/mono/mini/mini-arm64.c | 59 ++++++++++++++++++++++++ src/mono/mono/mini/simd-arm64.h | 3 ++ src/mono/mono/mini/simd-intrinsics.c | 24 +++++++--- 6 files changed, 107 insertions(+), 45 deletions(-) diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index a6ea6c3faa3504..afc605bf98bfd6 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -1122,15 +1122,12 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_ins_g(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, (((index) << 1) | 0b1) << (type), 0b0011, (rd), (rn)) #define arm_neon_ins_e(p, type, rd, rn, indexd, indexs) arm_neon_cpy_opcode ((p), 0b1, 0b1, (((indexd) << 1) | 0b1) << (type), (indexs) << (type), (rd), (rn)) -// Specific opcodes: -#define arm_neon_dup_e_8b(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001 | ((index) << 1), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_16b(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00001 | ((index) << 1), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_4h(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00010 | ((index) << 2), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_8h(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00010 | ((index) << 2), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_2s(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00100 | ((index) << 3), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_4s(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100 | ((index) << 3), 0b0000, (rd), (rn)) -#define arm_neon_dup_e_2d(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100 | ((index) << 4), 0b0000, (rd), (rn)) +#define arm_neon_smov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << ((type)-1)) | ((index) << (type)), 0b0101, (rd), (rn)) +#define arm_neon_umov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << ((type)-1)) | ((index) << (type)), 0b0111, (rd), (rn)) +#define arm_neon_dup_e(p, width, type, rd, rn, index) arm_neon_cpy_opcode ((p), (width), 0b0, (0b00001 << (type)) | ((index) << ((type)+1)), 0b0000, (rd), (rn)) +#define arm_neon_fdup_e(p, width, type, rd, rn, index) arm_neon_dup_e ((p), (width), (type) + TYPE_I32, (rd), (rn), (index)) +// Specific opcodes: #define arm_neon_dup_g_8b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001, 0b0001, (rd), (rn)) #define arm_neon_dup_g_16b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00001, 0b0001, (rd), (rn)) #define arm_neon_dup_g_4h(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00010, 0b0001, (rd), (rn)) @@ -1139,19 +1136,15 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_dup_g_4s(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100, 0b0001, (rd), (rn)) #define arm_neon_dup_g_2d(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100, 0b0001, (rd), (rn)) -// the opcode is smov, but we define variants smovs and smovd by whether they fill a 32 or 64-bit reg. -#define arm_neon_smovs_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) -#define arm_neon_smovs_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) -#define arm_neon_smovd_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) -#define arm_neon_smovd_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) -#define arm_neon_smovd_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) +#define arm_neon_smov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) +#define arm_neon_smov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) +#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) +#define arm_neon_smov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0101, (rd), (rn)) -// the opcode is umov, but we define variants smovs and smovd by whether they fill a 32 or 64-bit reg. -#define arm_neon_umovs_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) -#define arm_neon_umovs_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) -#define arm_neon_umovd_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) -#define arm_neon_umovd_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) -#define arm_neon_umovd_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b00100 | ((index) << 3), 0b0111, (rd), (rn)) +#define arm_neon_umov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) +#define arm_neon_umov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) +#define arm_neon_umov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0111, (rd), (rn)) +#define arm_neon_umov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0111, (rd), (rn)) /* NEON :: 3-register same FP16 */ // TODO @@ -1575,6 +1568,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) /* NEON :: across lanes */ #define arm_neon_xln_opcode(p, q, u, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110001100000000100000000000 | (u) << 29 | (size) << 22 | (opcode) << 12, (rd), (rn)) +#define arm_neon_addv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b0, (type), 0b11011, (rd), (rn)) // contrary to most other opcodes, the suffix is the type of source #define arm_neon_saddlv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b00011, (rd), (rn)) @@ -1595,12 +1589,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_sminv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11010, (rd), (rn)) #define arm_neon_sminv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b11010, (rd), (rn)) -#define arm_neon_addv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11011, (rd), (rn)) -#define arm_neon_addv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11011, (rd), (rn)) -#define arm_neon_addv_4h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b11011, (rd), (rn)) -#define arm_neon_addv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11011, (rd), (rn)) -#define arm_neon_addv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b11011, (rd), (rn)) - // some fp16 opcodes here: fmaxnmv, fmaxv, fminnmv, fminv #define arm_neon_uaddlv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b00011, (rd), (rn)) @@ -1821,6 +1809,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_cmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b10001, (rd), (rn), (rm)) #define arm_neon_cmhi(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00110, (rd), (rn), (rm)) #define arm_neon_cmhs(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00111, (rd), (rn), (rm)) +#define arm_neon_addp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b10111, (rd), (rn), (rm)) // Generalized macros for float ops: // width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL} @@ -1834,6 +1823,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_fcmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b11100, (rd), (rn), (rm)) #define arm_neon_fcmge(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11100, (rd), (rn), (rm)) #define arm_neon_fcmgt(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11100, (rd), (rn), (rm)) +#define arm_neon_faddp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn), (rm)) // Generalized macros for bitwise ops: // width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL} @@ -2003,14 +1993,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_sqdmulh_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10110, (rd), (rn), (rm)) #define arm_neon_sqdmulh_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10110, (rd), (rn), (rm)) -#define arm_neon_addp_8b(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_16b(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_4h(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_8h(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10111, (rd), (rn), (rm)) -#define arm_neon_addp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_8, 0b10111, (rd), (rn), (rm)) - #define arm_neon_fmaxnm_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11000, (rd), (rn), (rm)) #define arm_neon_fmaxnm_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11000, (rd), (rn), (rm)) #define arm_neon_fmaxnm_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11000, (rd), (rn), (rm)) @@ -2246,10 +2228,6 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_fmaxnmp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11000, (rd), (rn), (rm)) #define arm_neon_fmaxnmp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11000, (rd), (rn), (rm)) -#define arm_neon_faddp_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11010, (rd), (rn), (rm)) -#define arm_neon_faddp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11010, (rd), (rn), (rm)) -#define arm_neon_faddp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11010, (rd), (rn), (rm)) - #define arm_neon_fmul_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11011, (rd), (rn), (rm)) #define arm_neon_fmul_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11011, (rd), (rn), (rm)) #define arm_neon_fmul_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11011, (rd), (rn), (rm)) diff --git a/src/mono/mono/arch/arm64/codegen-test.c b/src/mono/mono/arch/arm64/codegen-test.c index b89adf007f2eea..b50a6432cf252a 100644 --- a/src/mono/mono/arch/arm64/codegen-test.c +++ b/src/mono/mono/arch/arm64/codegen-test.c @@ -477,6 +477,11 @@ main (int argc, char *argv []) arm_neon_ins_e (code, TYPE_I8, ARMREG_R0, ARMREG_R1, 1, 5); // insert v1.b[5] into v0.b[1] arm_neon_ins_e (code, TYPE_I32, ARMREG_R0, ARMREG_R1, 1, 2); // insert v1.s[2] into v0.s[1] + // pairwise and horizontal adds + arm_neon_addv (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1); + arm_neon_addp (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2); + arm_neon_faddp (code, VREG_FULL, TYPE_F32, ARMREG_R0, ARMREG_R1, ARMREG_R2); + for (i = 0; i < code - buf; ++i) printf (".byte %d\n", buf [i]); printf ("\n"); diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc index a084543ebb5b2e..3953720c88575d 100644 --- a/src/mono/mono/mini/cpu-arm64.mdesc +++ b/src/mono/mono/mini/cpu-arm64.mdesc @@ -505,6 +505,13 @@ negate: dest:x src1:x len:4 ones_complement: dest:x src1:x len:4 xbinop_forceint: dest:x src1:x src2:x len:4 xcast: dest:x src1:x len:4 clob:1 +extract_i1: dest:i src1:x len:4 +extract_i2: dest:i src1:x len:4 +extract_i4: dest:i src1:x len:4 +extract_i8: dest:i src1:x len:4 +extract_r4: dest:f src1:x len:4 +extract_r8: dest:f src1:x len:4 +arm64_xaddv: dest:x src1:x len:8 generic_class_init: src1:a len:44 clob:c gc_safe_point: src1:i len:12 clob:c diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index d54b3137a0f77f..2c5dbcea23c8f7 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -27,6 +27,7 @@ #include #include #include +#include "llvm-intrinsics-types.h" #include "interp/interp.h" @@ -35,6 +36,7 @@ #define PARENTHESIZE(...) (__VA_ARGS__) #define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__)) #define OPFMT_WDSS _w, dreg, sreg1, sreg2 +#define OPFMT_WTDS _w, _t, dreg, sreg1 #define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2 #define OPFMT_WTDSS_REV _w, _t, dreg, sreg2, sreg1 #define _UNDEF(...) g_assert_not_reached () @@ -3445,6 +3447,12 @@ is_type_float_macro (MonoTypeEnum type) return (type == MONO_TYPE_R4 || type == MONO_TYPE_R8); } +static gboolean +is_type_unsigned_macro (MonoTypeEnum type) +{ + return (type == MONO_TYPE_U1 || type == MONO_TYPE_U2 || type == MONO_TYPE_U4 || type == MONO_TYPE_U8); +} + static int get_vector_size_macro (MonoInst *ins) { @@ -3715,6 +3723,57 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_XCAST: break; + + + case OP_EXTRACT_I1: + case OP_EXTRACT_I2: + case OP_EXTRACT_I4: + case OP_EXTRACT_I8: { + const int t = get_type_size_macro (ins->inst_c1); + if (is_type_unsigned_macro (ins->inst_c1)) { + arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0); + } else { + arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0); + } + break; + } + case OP_EXTRACT_R4: + case OP_EXTRACT_R8: + if (ins->dreg != ins->sreg1 || ins->inst_c0 != 0) { + const int t = get_type_size_macro (ins->inst_c1); + // Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should + // set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest + // of the F/XREG is ignored in FREG mode, this operation remains valid. + arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0); + } + break; + case OP_ARM64_XADDV: { + switch (ins->inst_c0) { + case INTRINS_AARCH64_ADV_SIMD_FADDV: + if (ins->inst_c1 == MONO_TYPE_R8) { + arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1); + } else if (ins->inst_c1 == MONO_TYPE_R4) { + arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1); + arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg); + } else { + g_assert_not_reached (); + } + break; + + case INTRINS_AARCH64_ADV_SIMD_UADDV: + case INTRINS_AARCH64_ADV_SIMD_SADDV: + if (ins->inst_c1 == MONO_TYPE_I8 || ins->inst_c1 == MONO_TYPE_U8) + arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1); + else + g_assert_not_reached (); // remaining int types are handled through the codegen table + break; + + default: + g_assert_not_reached (); + } + break; + } + /* BRANCH */ case OP_BR: mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb, MONO_R_ARM64_B); diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h index 4af1a21892d4c7..21836b05103b12 100644 --- a/src/mono/mono/mini/simd-arm64.h +++ b/src/mono/mono/mini/simd-arm64.h @@ -62,3 +62,6 @@ SIMD_OP (128, OP_XBINOP, OP_FMIN, WTDSS, _UNDEF, SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_AND, WDSS, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and) SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_OR, WDSS, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr) SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_XOR, WDSS, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor) +SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_UADDV, WTDS, arm_neon_addv, arm_neon_addv, arm_neon_addv, _SKIP, _UNDEF, _UNDEF) +SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_SADDV, WTDS, arm_neon_addv, arm_neon_addv, arm_neon_addv, _SKIP, _UNDEF, _UNDEF) +SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_FADDV, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, _SKIP, _SKIP) \ No newline at end of file diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 73121fc4a3668a..c3c42c2bd4c323 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -613,15 +613,23 @@ emit_sum_vector (MonoCompile *cfg, MonoType *vector_type, MonoTypeEnum element_t return ins; } - MonoInst *ins = emit_simd_ins (cfg, vector_class, OP_ARM64_XADDV, arg->dreg, -1); - + MonoInst *sum = emit_simd_ins (cfg, vector_class, OP_ARM64_XADDV, arg->dreg, -1); if (type_enum_is_float (element_type)) { - ins->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; + sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV; + sum->inst_c1 = element_type; } else { - ins->inst_c0 = type_enum_is_unsigned (element_type) ? INTRINS_AARCH64_ADV_SIMD_UADDV : INTRINS_AARCH64_ADV_SIMD_SADDV; + sum->inst_c0 = type_enum_is_unsigned (element_type) ? INTRINS_AARCH64_ADV_SIMD_UADDV : INTRINS_AARCH64_ADV_SIMD_SADDV; + sum->inst_c1 = element_type; } - return ins; + if (COMPILE_LLVM (cfg)) { + return sum; + } else { + MonoInst *ins = emit_simd_ins (cfg, vector_class, type_to_extract_op (element_type), sum->dreg, -1); + ins->inst_c0 = 0; + ins->inst_c1 = element_type; + return ins; + } } #endif #ifdef TARGET_WASM @@ -1191,8 +1199,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #endif // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 #ifdef TARGET_ARM64 - if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) - return NULL; + //if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) + // return NULL; #endif int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod); @@ -1233,6 +1241,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_AsUInt64: case SN_Max: case SN_Min: + case SN_Sum: break; default: return NULL; @@ -1908,6 +1917,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign case SN_op_BitwiseAnd: case SN_op_BitwiseOr: case SN_op_ExclusiveOr: + case SN_Sum: break; default: return NULL; From f0c04e3c328e31ea21554a2e4112f93be7dcf5fa Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Tue, 21 Mar 2023 12:09:10 +0100 Subject: [PATCH 2/4] Fixed smov macro. Added SN_ToScalar. Fixed code style. --- src/mono/mono/arch/arm64/arm64-codegen.h | 7 +++---- src/mono/mono/mini/mini-arm64.c | 17 ++++++++--------- src/mono/mono/mini/simd-intrinsics.c | 2 +- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index 53550913cb2759..2f4eb469b32e72 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -1122,8 +1122,8 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_ins_g(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, (((index) << 1) | 0b1) << (type), 0b0011, (rd), (rn)) #define arm_neon_ins_e(p, type, rd, rn, indexd, indexs) arm_neon_cpy_opcode ((p), 0b1, 0b1, (((indexd) << 1) | 0b1) << (type), (indexs) << (type), (rd), (rn)) -#define arm_neon_smov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << ((type)-1)) | ((index) << (type)), 0b0101, (rd), (rn)) -#define arm_neon_umov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << ((type)-1)) | ((index) << (type)), 0b0111, (rd), (rn)) +#define arm_neon_smov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I32) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0101, (rd), (rn)) +#define arm_neon_umov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0111, (rd), (rn)) #define arm_neon_dup_e(p, width, type, rd, rn, index) arm_neon_cpy_opcode ((p), (width), 0b0, (0b00001 << (type)) | ((index) << ((type)+1)), 0b0000, (rd), (rn)) #define arm_neon_fdup_e(p, width, type, rd, rn, index) arm_neon_dup_e ((p), (width), (type) + TYPE_I32, (rd), (rn), (index)) @@ -1138,8 +1138,7 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_neon_smov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn)) #define arm_neon_smov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn)) -#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) -#define arm_neon_smov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0101, (rd), (rn)) +#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn)) #define arm_neon_umov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn)) #define arm_neon_umov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn)) diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index dbac61b7cfd921..41e61f98da4bab 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -3744,28 +3744,27 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_XCAST: break; - - case OP_EXTRACT_I1: case OP_EXTRACT_I2: case OP_EXTRACT_I4: case OP_EXTRACT_I8: { const int t = get_type_size_macro (ins->inst_c1); - if (is_type_unsigned_macro (ins->inst_c1)) { + // smov is not defined for i64 + if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) { arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0); } else { arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0); - } + } break; } case OP_EXTRACT_R4: case OP_EXTRACT_R8: if (ins->dreg != ins->sreg1 || ins->inst_c0 != 0) { - const int t = get_type_size_macro (ins->inst_c1); - // Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should - // set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest - // of the F/XREG is ignored in FREG mode, this operation remains valid. - arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0); + const int t = get_type_size_macro (ins->inst_c1); + // Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should + // set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest + // of the F/XREG is ignored in FREG mode, this operation remains valid. + arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0); } break; case OP_ARM64_XADDV: { diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 82ad71acaa8120..06d3cfe024245a 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1259,6 +1259,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_Max: case SN_Min: case SN_Sum: + case SN_ToScalar: break; default: return NULL; @@ -1949,7 +1950,6 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign case SN_op_BitwiseAnd: case SN_op_BitwiseOr: case SN_op_ExclusiveOr: - case SN_Sum: case SN_op_Equality: case SN_op_Inequality: break; From 3359543321e4b112328db9cd946ae62f9434cb56 Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Tue, 21 Mar 2023 13:59:12 +0100 Subject: [PATCH 3/4] Fixed vector sums of nint/nuint. --- src/mono/mono/mini/mini-arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index 41e61f98da4bab..55c26e0e4f35eb 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -3781,8 +3781,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; case INTRINS_AARCH64_ADV_SIMD_UADDV: - case INTRINS_AARCH64_ADV_SIMD_SADDV: - if (ins->inst_c1 == MONO_TYPE_I8 || ins->inst_c1 == MONO_TYPE_U8) + case INTRINS_AARCH64_ADV_SIMD_SADDV: + if (get_type_size_macro (ins->inst_c1) == TYPE_I64) arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1); else g_assert_not_reached (); // remaining int types are handled through the codegen table From 6561cb9b0a324c7bf0119750ea834893428a6dbf Mon Sep 17 00:00:00 2001 From: Jan Dupej Date: Tue, 21 Mar 2023 15:47:14 +0100 Subject: [PATCH 4/4] Temporarily disable intrinsics, until all are implemented. --- src/mono/mono/mini/simd-intrinsics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 06d3cfe024245a..168fff859c935c 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1206,8 +1206,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #endif // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 #ifdef TARGET_ARM64 - //if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) - // return NULL; + if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) + return NULL; #endif int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod);