Skip to content

Commit e2afdaf

Browse files
committed
optimize crc64ecma_combine, and refactor crc32c_combine_sw
1 parent 9e7602c commit e2afdaf

File tree

1 file changed

+30
-54
lines changed

1 file changed

+30
-54
lines changed

common/checksum/crc.cpp

Lines changed: 30 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -575,20 +575,15 @@ uint32_t crc32c_combine_series_hw(uint32_t* crc, uint32_t part_size, uint32_t n_
575575
return res;
576576
}
577577

578-
static uint64_t bit_reverse32_64(uint32_t x) {
579-
x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
580-
x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
581-
x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
582-
x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
583-
uint64_t x64 = (x >> 16) | (x << 16);
584-
return x64 << 32;
578+
inline uint64_t clmul(uint32_t a, uint32_t b) {
579+
uint64_t ret = 0, B = b;
580+
for (uint32_t i = 0; i < 32; ++i, a>>=1, B<<=1)
581+
if (a&1) ret ^= B;
582+
return ret;
585583
}
586584

587-
// do lshift with traditional scalar instructions
588585
static uint32_t do_crc32c_lshift_sw(uint32_t crc1, uint32_t x) {
589-
uint64_t q = 0, xrev = bit_reverse32_64(x);
590-
for (int i = 0; i < 64; i++, xrev >>= 1)
591-
q = (q << 1) | __builtin_parity(crc1 & xrev);
586+
auto q = clmul(crc1, x);
592587
return crc32c_sw((uint8_t*)&q, 8, 0);
593588
}
594589

@@ -863,43 +858,40 @@ uint64_t crc64ecma_hw_sse128(const uint8_t *buf, size_t len, uint64_t crc) {
863858

864859
__attribute__((aligned(16), used))
865860
const static uint64_t crc64ecma_lshift_table[] = {
866-
// by length of 32, 64, ..., 4G bytes
867-
0xe05dd497ca393ae4, 0xb5ea1af9c013aca4, 0x9e735cb59b4724da, 0x2ecbc6dd0447c685,
868-
0x15d325270d465dfe, 0x3f663335b446e329, 0x68a971ffad4f1766, 0xb7fd3098b8293475,
869-
0x1b225daef15ff37d, 0xea0ddceb7273a052, 0x620648e8f01b0134, 0xfba68785783fd770,
870-
0x4c0976fee55cc2f6, 0x2b4d20a2ca417feb, 0xf8a6eaabee9c15b0, 0x2068c6b926839fed,
871-
0xc8fef34a6a17f35a, 0x0de5614bc4140f08, 0x724fd734d6b83fac, 0x710def1593c89dfa,
872-
0x337733943ec752b9, 0x2a46a23b7286e04a, 0x5e7857ffbea8390a, 0xa2988a6572c32450,
873-
0xdb67937ee00fcea3, 0xb427cfd9f16baa88, 0x465d86031d8426b3, 0x56ebf2f895082092,
861+
// by length of 1, 2, 4, 8, ..., 4G bytes
862+
0x0100000000000000, 0x0001000000000000, 0x0000000100000000, 0x0000000000000001,
863+
0xdabe95afc7875f40, 0x3be653a30fe1af51, 0x081f6054a7842df4, 0xd7d86b2af73de740,
864+
0xf31fd9271e228b79, 0x430af18f45bfec70, 0xdb77241e49bab9e4, 0xf42819f9b69abd6a,
865+
0x8366e0bd97880af6, 0x3a0dc386f69b9d51, 0x6537db4df869f6e6, 0xa349f9a86a172f2e,
866+
0x629f50ac6fdf5d3a, 0xcb3c521f853fb4a1, 0x6d1de95abb95074b, 0x9172d2fcc1985c9c,
867+
0xc996d644b4a25645, 0x7d4a8a9b19cb8376, 0xbd09c74d8e1fd5e2, 0xb3223c4776751d27,
868+
0x57f0d333946ab755, 0x4b2c7a963a1e77c7, 0x78ff6b0f1d73d8c7, 0x740d65cdcd060ca9,
869+
0xc7f7a66770f6ce2e, 0xe8195ee90ae40993, 0xd9443a2e9c8cb27f, 0x17a88be197b6abdc,
870+
0x6975abb7ef289b8f,
874871
};
875872

876873
// virtually pad `len2` bytes of 0 to source
877874
// data, and return resulting crc value
878-
// `len2` must be >= 32
879-
static uint64_t crc64ecma_lshift_big(uint64_t crc1, uint32_t len2,
880-
uint64_t (*shift)(uint64_t crc1, uint64_t x)) {
881-
for (len2 >>= 5; len2; len2 &= len2 - 1) {
875+
static uint64_t do_crc64ecma_combine(uint64_t crc1, uint64_t crc2,
876+
uint32_t len2, uint64_t (*shift)(uint64_t crc1, uint64_t x)) {
877+
if (unlikely(!crc1)) return crc2;
878+
if (unlikely(!len2)) return crc1;
879+
for (; len2; len2 &= len2 - 1) {
882880
auto x = crc64ecma_lshift_table[__builtin_ctz(len2)];
883881
crc1 = shift(crc1, x);
884882
}
885-
return crc1;
883+
return crc1 ^ crc2;
886884
}
887885

888886
static uint64_t do_crc64ecma_lshift_hw(uint64_t crc, uint64_t x) {
889887
__m128i crc1x, crc2x, crc3x, constx;
890-
const __m128i rk5 = _mm_loadl_epi64((__m128i*)&rk[5-1]);
891888
const __m128i rk7 = _mm_loadu_si128((__m128i*)&rk[7-1]);
892889

893890
crc1x = _mm_cvtsi64_si128(crc);
894891
constx = _mm_cvtsi64_si128(x);
895892
crc1x = _mm_clmulepi64_si128(crc1x, constx, 0x00);
896893

897-
// Fold to 64b
898-
crc2x = _mm_clmulepi64_si128(crc1x, rk5, 0x00);
899-
crc3x = _mm_bsrli_si128(crc1x, 8);
900-
crc1x = _mm_xor_si128(crc2x, crc3x);
901-
902-
// Reduce
894+
// Barrett Reduce
903895
crc2x = _mm_clmulepi64_si128(crc1x, rk7, 0x00);
904896
crc3x = _mm_clmulepi64_si128(crc2x, rk7, 0x10);
905897
crc2x = _mm_bslli_si128(crc2x, 8);
@@ -909,41 +901,25 @@ static uint64_t do_crc64ecma_lshift_hw(uint64_t crc, uint64_t x) {
909901
}
910902

911903
uint64_t crc64ecma_combine_hw(uint64_t crc1, uint64_t crc2, uint32_t len2) {
912-
if (unlikely(!crc1)) return crc2;
913-
if (unlikely(!len2)) return crc1;
914-
if (unlikely(len2 & 31)) {
915-
crc1 = ~crc64ecma_hw(zeros, len2 & 31, ~crc1);
916-
}
917-
crc1 = crc64ecma_lshift_big(crc1, len2, do_crc64ecma_lshift_hw);
918-
return crc1 ^ crc2;
904+
return do_crc64ecma_combine(crc1, crc2, len2, do_crc64ecma_lshift_hw);
919905
}
920906

921907
inline __uint128_t clmul(uint64_t a, uint64_t b) {
922-
__uint128_t ret = 0;
923-
for (uint32_t i = 0; i < 64; ++i)
924-
if ((a >> i) & 1)
925-
ret ^= (__uint128_t)b << i;
908+
__uint128_t ret = 0, B = b;
909+
for (uint32_t i = 0; i < 64; ++i, a>>=1, B<<=1)
910+
if (a&1) ret ^= B;
926911
return ret;
927912
}
928913

929914
static uint64_t do_crc64ecma_lshift_sw(uint64_t crc, uint64_t x) {
930915
__uint128_t crc1x = clmul(crc, x);
931-
__uint128_t crc2x = clmul(crc1x, rk[5-1]);
932-
__uint128_t crc3x = crc1x >> 64;
933-
crc1x = crc2x ^ crc3x;
934-
crc2x = clmul(crc1x, rk[7-1]);
935-
crc3x = clmul(crc2x, rk[7]);
916+
__uint128_t crc2x = clmul(crc1x, rk[7-1]);
917+
__uint128_t crc3x = clmul(crc2x, rk[7]);
936918
return (crc1x >> 64) ^ crc2x ^ (crc3x >> 64);
937919
}
938920

939921
uint64_t crc64ecma_combine_sw(uint64_t crc1, uint64_t crc2, uint32_t len2) {
940-
if (unlikely(!crc1)) return crc2;
941-
if (unlikely(!len2)) return crc1;
942-
if (unlikely(len2 & 31)) {
943-
crc1 = ~crc64ecma_sw(zeros, len2 & 31, ~crc1);
944-
}
945-
crc1 = crc64ecma_lshift_big(crc1, len2, do_crc64ecma_lshift_sw);
946-
return crc1 ^ crc2;
922+
return do_crc64ecma_combine(crc1, crc2, len2, do_crc64ecma_lshift_sw);
947923
}
948924

949925
#ifdef __x86_64__

0 commit comments

Comments (0)