@@ -575,20 +575,15 @@ uint32_t crc32c_combine_series_hw(uint32_t* crc, uint32_t part_size, uint32_t n_
575575 return res;
576576}
577577
578- static uint64_t bit_reverse32_64 (uint32_t x) {
579- x = (((x & 0xaaaaaaaa ) >> 1 ) | ((x & 0x55555555 ) << 1 ));
580- x = (((x & 0xcccccccc ) >> 2 ) | ((x & 0x33333333 ) << 2 ));
581- x = (((x & 0xf0f0f0f0 ) >> 4 ) | ((x & 0x0f0f0f0f ) << 4 ));
582- x = (((x & 0xff00ff00 ) >> 8 ) | ((x & 0x00ff00ff ) << 8 ));
583- uint64_t x64 = (x >> 16 ) | (x << 16 );
584- return x64 << 32 ;
578+ inline uint64_t clmul (uint32_t a, uint32_t b) {
579+ uint64_t ret = 0 , B = b;
580+ for (uint32_t i = 0 ; i < 32 ; ++i, a>>=1 , B<<=1 )
581+ if (a&1 ) ret ^= B;
582+ return ret;
585583}
586584
587- // do lshift with traditional scalar instructions
588585static uint32_t do_crc32c_lshift_sw (uint32_t crc1, uint32_t x) {
589- uint64_t q = 0 , xrev = bit_reverse32_64 (x);
590- for (int i = 0 ; i < 64 ; i++, xrev >>= 1 )
591- q = (q << 1 ) | __builtin_parity (crc1 & xrev);
586+ auto q = clmul (crc1, x);
592587 return crc32c_sw ((uint8_t *)&q, 8 , 0 );
593588}
594589
@@ -863,43 +858,40 @@ uint64_t crc64ecma_hw_sse128(const uint8_t *buf, size_t len, uint64_t crc) {
863858
864859__attribute__ ((aligned(16 ), used))
865860const static uint64_t crc64ecma_lshift_table[] = {
866- // by length of 32, 64, ..., 4G bytes
867- 0xe05dd497ca393ae4 , 0xb5ea1af9c013aca4 , 0x9e735cb59b4724da , 0x2ecbc6dd0447c685 ,
868- 0x15d325270d465dfe , 0x3f663335b446e329 , 0x68a971ffad4f1766 , 0xb7fd3098b8293475 ,
869- 0x1b225daef15ff37d , 0xea0ddceb7273a052 , 0x620648e8f01b0134 , 0xfba68785783fd770 ,
870- 0x4c0976fee55cc2f6 , 0x2b4d20a2ca417feb , 0xf8a6eaabee9c15b0 , 0x2068c6b926839fed ,
871- 0xc8fef34a6a17f35a , 0x0de5614bc4140f08 , 0x724fd734d6b83fac , 0x710def1593c89dfa ,
872- 0x337733943ec752b9 , 0x2a46a23b7286e04a , 0x5e7857ffbea8390a , 0xa2988a6572c32450 ,
873- 0xdb67937ee00fcea3 , 0xb427cfd9f16baa88 , 0x465d86031d8426b3 , 0x56ebf2f895082092 ,
861+ // by length of 1, 2, 4, 8, ..., 4G bytes
862+ 0x0100000000000000 , 0x0001000000000000 , 0x0000000100000000 , 0x0000000000000001 ,
863+ 0xdabe95afc7875f40 , 0x3be653a30fe1af51 , 0x081f6054a7842df4 , 0xd7d86b2af73de740 ,
864+ 0xf31fd9271e228b79 , 0x430af18f45bfec70 , 0xdb77241e49bab9e4 , 0xf42819f9b69abd6a ,
865+ 0x8366e0bd97880af6 , 0x3a0dc386f69b9d51 , 0x6537db4df869f6e6 , 0xa349f9a86a172f2e ,
866+ 0x629f50ac6fdf5d3a , 0xcb3c521f853fb4a1 , 0x6d1de95abb95074b , 0x9172d2fcc1985c9c ,
867+ 0xc996d644b4a25645 , 0x7d4a8a9b19cb8376 , 0xbd09c74d8e1fd5e2 , 0xb3223c4776751d27 ,
868+ 0x57f0d333946ab755 , 0x4b2c7a963a1e77c7 , 0x78ff6b0f1d73d8c7 , 0x740d65cdcd060ca9 ,
869+ 0xc7f7a66770f6ce2e , 0xe8195ee90ae40993 , 0xd9443a2e9c8cb27f , 0x17a88be197b6abdc ,
870+ 0x6975abb7ef289b8f ,
874871};
875872
876873// virtually pad `len2` bytes of 0 to source
877874// data, and return resulting crc value
878- // `len2` must be >= 32
879- static uint64_t crc64ecma_lshift_big (uint64_t crc1, uint32_t len2,
880- uint64_t (*shift)(uint64_t crc1, uint64_t x)) {
881- for (len2 >>= 5 ; len2; len2 &= len2 - 1 ) {
875+ static uint64_t do_crc64ecma_combine (uint64_t crc1, uint64_t crc2,
876+ uint32_t len2, uint64_t (*shift)(uint64_t crc1, uint64_t x)) {
877+ if (unlikely (!crc1)) return crc2;
878+ if (unlikely (!len2)) return crc1;
879+ for (; len2; len2 &= len2 - 1 ) {
882880 auto x = crc64ecma_lshift_table[__builtin_ctz (len2)];
883881 crc1 = shift (crc1, x);
884882 }
885- return crc1;
883+ return crc1 ^ crc2 ;
886884}
887885
888886static uint64_t do_crc64ecma_lshift_hw (uint64_t crc, uint64_t x) {
889887 __m128i crc1x, crc2x, crc3x, constx;
890- const __m128i rk5 = _mm_loadl_epi64 ((__m128i*)&rk[5 -1 ]);
891888 const __m128i rk7 = _mm_loadu_si128 ((__m128i*)&rk[7 -1 ]);
892889
893890 crc1x = _mm_cvtsi64_si128 (crc);
894891 constx = _mm_cvtsi64_si128 (x);
895892 crc1x = _mm_clmulepi64_si128 (crc1x, constx, 0x00 );
896893
897- // Fold to 64b
898- crc2x = _mm_clmulepi64_si128 (crc1x, rk5, 0x00 );
899- crc3x = _mm_bsrli_si128 (crc1x, 8 );
900- crc1x = _mm_xor_si128 (crc2x, crc3x);
901-
902- // Reduce
894+ // Barrett Reduce
903895 crc2x = _mm_clmulepi64_si128 (crc1x, rk7, 0x00 );
904896 crc3x = _mm_clmulepi64_si128 (crc2x, rk7, 0x10 );
905897 crc2x = _mm_bslli_si128 (crc2x, 8 );
@@ -909,41 +901,25 @@ static uint64_t do_crc64ecma_lshift_hw(uint64_t crc, uint64_t x) {
909901}
910902
911903uint64_t crc64ecma_combine_hw (uint64_t crc1, uint64_t crc2, uint32_t len2) {
912- if (unlikely (!crc1)) return crc2;
913- if (unlikely (!len2)) return crc1;
914- if (unlikely (len2 & 31 )) {
915- crc1 = ~crc64ecma_hw (zeros, len2 & 31 , ~crc1);
916- }
917- crc1 = crc64ecma_lshift_big (crc1, len2, do_crc64ecma_lshift_hw);
918- return crc1 ^ crc2;
904+ return do_crc64ecma_combine (crc1, crc2, len2, do_crc64ecma_lshift_hw);
919905}
920906
921907inline __uint128_t clmul (uint64_t a, uint64_t b) {
922- __uint128_t ret = 0 ;
923- for (uint32_t i = 0 ; i < 64 ; ++i)
924- if ((a >> i) & 1 )
925- ret ^= (__uint128_t )b << i;
908+ __uint128_t ret = 0 , B = b;
909+ for (uint32_t i = 0 ; i < 64 ; ++i, a>>=1 , B<<=1 )
910+ if (a&1 ) ret ^= B;
926911 return ret;
927912}
928913
929914static uint64_t do_crc64ecma_lshift_sw (uint64_t crc, uint64_t x) {
930915 __uint128_t crc1x = clmul (crc, x);
931- __uint128_t crc2x = clmul (crc1x, rk[5 -1 ]);
932- __uint128_t crc3x = crc1x >> 64 ;
933- crc1x = crc2x ^ crc3x;
934- crc2x = clmul (crc1x, rk[7 -1 ]);
935- crc3x = clmul (crc2x, rk[7 ]);
916+ __uint128_t crc2x = clmul (crc1x, rk[7 -1 ]);
917+ __uint128_t crc3x = clmul (crc2x, rk[7 ]);
936918 return (crc1x >> 64 ) ^ crc2x ^ (crc3x >> 64 );
937919}
938920
939921uint64_t crc64ecma_combine_sw (uint64_t crc1, uint64_t crc2, uint32_t len2) {
940- if (unlikely (!crc1)) return crc2;
941- if (unlikely (!len2)) return crc1;
942- if (unlikely (len2 & 31 )) {
943- crc1 = ~crc64ecma_sw (zeros, len2 & 31 , ~crc1);
944- }
945- crc1 = crc64ecma_lshift_big (crc1, len2, do_crc64ecma_lshift_sw);
946- return crc1 ^ crc2;
922+ return do_crc64ecma_combine (crc1, crc2, len2, do_crc64ecma_lshift_sw);
947923}
948924
949925#ifdef __x86_64__
0 commit comments