Skip to content
This repository was archived by the owner on Feb 25, 2025. It is now read-only.

Commit 842e92e

Browse files
Mike Klein authored and Skia Commit-Bot committed
implement Op::bytes with NEON tbl
Starting to look good:

    1.57ns SkVM_4096_I32_SWAR
    3.89ns SkVM_4096_I32
    4.08ns SkVM_4096_I32_Naive
    7.84ns SkVM_4096_F32
    2.7ns SkVM_4096_RP
    1.26ns SkVM_4096_Opts

Change-Id: I24f44ed5fc693a04afef8d7a6ce13f3733277fb3
Reviewed-on: https://skia-review.googlesource.com/c/skia/+/224280
Reviewed-by: Herb Derby <[email protected]>
Commit-Queue: Mike Klein <[email protected]>
1 parent 135a9a5 commit 842e92e

2 files changed

Lines changed: 56 additions & 39 deletions

File tree

src/core/SkVM.cpp

Lines changed: 54 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,8 @@ namespace skvm {
739739

740740
// d += n*m (as noted on the declaration in SkVM.h).
// NOTE(review): the 4s suffix presumably means four 32-bit float lanes -- confirm.
void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
741741

742+
// NEON tbl, the byte table lookup used to implement Op::bytes: the JIT passes the
// source vector as n and the 128-bit control mask (see bytes_control()) as m,
// and the result lands in d.
// NOTE(review): the bit fields look like TBL Vd.16B, {Vn.16B}, Vm.16B (Q=1, one table
// register) -- verify against the Arm ARM.
void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
743+
742744
void Assembler::shift(uint32_t op, int imm, V n, V d) {
743745
this->word( (op & 22_mask) << 10
744746
| imm << 16 // imm is embedded inside op, bit size depends on op
@@ -831,6 +833,39 @@ namespace skvm {
831833
#endif
832834
}
833835

836+
// Just so happens that we can translate the immediate control for our bytes() op
// to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl!
//
// imm packs four 4-bit selectors, lowest nibble first, one per output byte
// of a 32-bit lane.  Each selector becomes one control byte:
//     0 -> 0xff, Fill with zero
//     1 -> 0x00, Select byte 0
//     2 -> 0x01,   "         1
//     3 -> 0x02,   "         2
//     4 -> 0x03,   "         3
//
// mask[i] receives the four control bytes for 32-bit lane i, output byte 0 in
// the low 8 bits.  Control bytes that select a source byte are advanced by 4*i
// so they point into lane i; 0xff (fill with zero) is left alone.
static void bytes_control(int imm, int mask[4]) {
    for (uint32_t lane = 0; lane < 4; lane++) {
        // Assemble the word as unsigned: a 0xff control in the top byte would
        // otherwise be shifted into the sign bit of a signed int.
        uint32_t packed = 0;
        for (uint32_t byte = 0; byte < 4; byte++) {
            const uint32_t nibble  = static_cast<uint32_t>((imm >> (4*byte)) & 0xf);
            const uint32_t control = (nibble == 0) ? 0xffu
                                                   : (nibble - 1) + 4*lane;
            packed |= control << (8*byte);
        }
        mask[lane] = static_cast<int>(packed);
    }
}
868+
834869
// Returns stride of the JIT, currently always 8.
835870
#if defined(__x86_64__)
836871
static int jit(Assembler& a, size_t* code,
@@ -867,49 +902,19 @@ namespace skvm {
867902
SkTHashMap<int, A::Label> vpshufb_masks;
868903
for (const Program::Instruction& inst : instructions) {
869904
if (inst.op == Op::bytes && vpshufb_masks.find(inst.imm) == nullptr) {
870-
// Translate bytes()'s control nibbles to vpshufb's control bytes.
871-
auto nibble_to_vpshufb = [](uint8_t n) -> uint8_t {
872-
// 0 -> 0xff, Fill with zero
873-
// 1 -> 0x00, Select byte 0
874-
// 2 -> 0x01, " 1
875-
// 3 -> 0x02, " 2
876-
// 4 -> 0x03, " 3
877-
return n - 1;
878-
};
879-
uint8_t control[] = {
880-
nibble_to_vpshufb( (inst.imm >> 0) & 0xf ),
881-
nibble_to_vpshufb( (inst.imm >> 4) & 0xf ),
882-
nibble_to_vpshufb( (inst.imm >> 8) & 0xf ),
883-
nibble_to_vpshufb( (inst.imm >> 12) & 0xf ),
884-
};
885-
886905
// Now, vpshufb is one of those weird AVX instructions
887906
// that does everything in 2 128-bit chunks, so we'll
888-
// only really need 4 distinct values to write in our pattern:
889-
int p[4];
890-
for (int i = 0; i < 4; i++) {
891-
p[i] = (int)control[0] << 0
892-
| (int)control[1] << 8
893-
| (int)control[2] << 16
894-
| (int)control[3] << 24;
895-
896-
// Update each byte that refers to a byte index by 4 to
897-
// point into the next 32-bit lane, but leave any 0xff
898-
// that fills with zero alone.
899-
control[0] += control[0] == 0xff ? 0 : 4;
900-
control[1] += control[1] == 0xff ? 0 : 4;
901-
control[2] += control[2] == 0xff ? 0 : 4;
902-
control[3] += control[3] == 0xff ? 0 : 4;
903-
}
907+
// write the same mask pattern twice.
908+
int mask[4];
909+
bytes_control(inst.imm, mask);
904910

905911
// Notice, same pattern for top 4 32-bit lanes as bottom 4 lanes.
906912
SkASSERT(a.size() % 32 == 0);
907913
A::Label label = a.here();
908-
a.byte(p, sizeof(p));
909-
a.byte(p, sizeof(p));
914+
a.byte(mask, sizeof(mask));
915+
a.byte(mask, sizeof(mask));
910916
vpshufb_masks.set(inst.imm, label);
911917
}
912-
913918
}
914919

915920
// Map from splat bit pattern to 4-byte aligned data location holding that pattern.
@@ -1053,8 +1058,17 @@ namespace skvm {
10531058
};
10541059
const int tmp = 23; // i.e. v31
10551060

1056-
SkTHashMap<int, A::Label> splats;
1061+
SkTHashMap<int, A::Label> tbl_masks,
1062+
splats;
10571063
for (const Program::Instruction& inst : instructions) {
1064+
if (inst.op == Op::bytes && tbl_masks.find(inst.imm) == nullptr) {
1065+
int mask[4];
1066+
bytes_control(inst.imm, mask);
1067+
1068+
A::Label label = a.here();
1069+
a.byte(mask, sizeof(mask));
1070+
tbl_masks.set(inst.imm, label);
1071+
}
10581072
if (inst.op == Op::splat) {
10591073
A::Label label = a.here();
10601074
a.word(inst.imm);
@@ -1135,7 +1149,9 @@ namespace skvm {
11351149
case Op::to_f32: a.scvtf4s (r(d), r(x)); break;
11361150
case Op::to_i32: a.fcvtzs4s(r(d), r(x)); break;
11371151

1138-
case Op::bytes: TODO;
1152+
case Op::bytes: a.ldrq(r(tmp), *tbl_masks.find(imm)); // TODO: hoist instead of tmp
1153+
a.tbl (r(d), r(x), r(tmp));
1154+
break;
11391155
}
11401156
}
11411157

@@ -1207,7 +1223,7 @@ namespace skvm {
12071223
fJIT.mask = mask;
12081224

12091225

1210-
#if defined(SKVM_PERF_DUMPS) // Debug dumps for perf.
1226+
#if 0 || defined(SKVM_PERF_DUMPS) // Debug dumps for perf.
12111227
#if defined(__aarch64__)
12121228
// cat | llvm-mc -arch aarch64 -disassemble
12131229
auto cur = (const uint8_t*)buf;

src/core/SkVM.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ namespace skvm {
105105
DOpNM and16b, orr16b, eor16b, bic16b,
106106
add4s, sub4s, mul4s,
107107
sub8h, mul8h,
108-
fadd4s, fsub4s, fmul4s, fdiv4s;
108+
fadd4s, fsub4s, fmul4s, fdiv4s,
109+
tbl;
109110

110111
// d += n*m
111112
void fmla4s(V d, V n, V m);

0 commit comments

Comments
 (0)