
Commit acca036

[lazy] Skip over incompressible data
Every 256 bytes the lazy match finders process without finding a match, they increase their step size by 1: for bytes [0, 256) they search every position, for bytes [256, 512) every other position, and so on. However, they currently still insert every position into their hash tables. This is different from fast & dfast, which only insert the positions they search.

This PR changes that: after we've searched 2KB without finding any matches, at which point we're only searching one in 9 positions, we stop inserting every position and only insert the positions we search. The exact cutoff of 2KB isn't terribly important; I've just selected a cutoff that is reasonably large, to minimize the impact on "normal" data. This PR only adds skipping to greedy, lazy, and lazy2, but does not touch btlazy2.

| Dataset | Level | Compiler     | CSize ∆ | Speed ∆ |
|---------|-------|--------------|---------|---------|
| Random  | 5     | clang-14.0.6 | 0.0%    | +704%   |
| Random  | 5     | gcc-12.2.0   | 0.0%    | +670%   |
| Random  | 7     | clang-14.0.6 | 0.0%    | +679%   |
| Random  | 7     | gcc-12.2.0   | 0.0%    | +657%   |
| Random  | 12    | clang-14.0.6 | 0.0%    | +1355%  |
| Random  | 12    | gcc-12.2.0   | 0.0%    | +1331%  |
| Silesia | 5     | clang-14.0.6 | +0.002% | +0.35%  |
| Silesia | 5     | gcc-12.2.0   | +0.002% | +2.45%  |
| Silesia | 7     | clang-14.0.6 | +0.001% | -1.40%  |
| Silesia | 7     | gcc-12.2.0   | +0.007% | +0.13%  |
| Silesia | 12    | clang-14.0.6 | +0.011% | +22.70% |
| Silesia | 12    | gcc-12.2.0   | +0.011% | -6.68%  |
| Enwik8  | 5     | clang-14.0.6 | 0.0%    | -1.02%  |
| Enwik8  | 5     | gcc-12.2.0   | 0.0%    | +0.34%  |
| Enwik8  | 7     | clang-14.0.6 | 0.0%    | -1.22%  |
| Enwik8  | 7     | gcc-12.2.0   | 0.0%    | -0.72%  |
| Enwik8  | 12    | clang-14.0.6 | 0.0%    | +26.19% |
| Enwik8  | 12    | gcc-12.2.0   | 0.0%    | -5.70%  |

The speed difference for clang at level 12 is real, but is probably caused by some sort of alignment or codegen issue: clang is significantly slower than gcc before this PR, and gets up to parity with it afterwards.

I also measured the ratio difference for the HC match finder, and it looks basically the same as for the row-based match finder. The speedup on random data looks similar, and performance is otherwise about neutral, without the big difference at level 12 for either clang or gcc.
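For a concrete feel for the cutoff, here is a small standalone sketch (not part of this commit) of the step arithmetic described above. It assumes kSearchStrength is 8, matching the "every 256 bytes" behaviour, and the list of sample distances is arbitrary:

```c
#include <stdio.h>
#include <stddef.h>

/* Assumed constants: kSearchStrength = 8 gives one step increase per 256 bytes,
 * as described in the commit message; kLazySkippingStep is the cutoff added here. */
#define kSearchStrength   8
#define kLazySkippingStep 8

int main(void)
{
    size_t const distances[] = {0, 255, 256, 1024, 2047, 2048, 4096};
    size_t i;
    for (i = 0; i < sizeof(distances)/sizeof(distances[0]); ++i) {
        size_t const dist = distances[i];                   /* ip - anchor: bytes since the last match */
        size_t const step = (dist >> kSearchStrength) + 1;  /* positions skipped per failed search */
        int const lazySkipping = step > kLazySkippingStep;  /* stop inserting every position? */
        printf("dist=%5zu  step=%2zu  lazySkipping=%d\n", dist, step, lazySkipping);
    }
    /* At dist >= 2048 (2KB without a match) step reaches 9 > 8, so the lazy
     * match finders only insert the positions they actually search. */
    return 0;
}
```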
Parent commit: 488e45f

3 files changed: +71, -16 lines changed

lib/compress/zstd_compress.c

Lines changed: 1 addition & 0 deletions
@@ -1947,6 +1947,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
     }
 
     ms->hashLog3 = hashLog3;
+    ms->lazySkipping = 0;
 
     ZSTD_invalidateMatchState(ms);

lib/compress/zstd_compress_internal.h

Lines changed: 7 additions & 0 deletions
@@ -249,6 +249,13 @@ struct ZSTD_matchState_t {
      * This behavior is controlled from the cctx ms.
      * This parameter has no effect in the cdict ms. */
     int prefetchCDictTables;
+
+    /* When == 0, lazy match finders insert every position.
+     * When != 0, lazy match finders only insert positions they search.
+     * This allows them to skip much faster over incompressible data,
+     * at a small cost to compression ratio.
+     */
+    int lazySkipping;
 };
 
 typedef struct {

lib/compress/zstd_lazy.c

Lines changed: 63 additions & 16 deletions
@@ -12,6 +12,8 @@
 #include "zstd_lazy.h"
 #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
 
+#define kLazySkippingStep 8
+
 
 /*-*************************************
 *  Binary Tree search
@@ -618,7 +620,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
 FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
                         ZSTD_matchState_t* ms,
                         const ZSTD_compressionParameters* const cParams,
-                        const BYTE* ip, U32 const mls)
+                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
 {
     U32* const hashTable = ms->hashTable;
     const U32 hashLog = cParams->hashLog;
@@ -633,6 +635,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
         NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
         hashTable[h] = idx;
         idx++;
+        /* Stop inserting every position when in the lazy skipping mode. */
+        if (lazySkipping)
+            break;
     }
 
     ms->nextToUpdate = target;
@@ -641,7 +646,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
 
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
 }
 
 /* inlining is important to hardwire a hot branch (template emulation) */
@@ -685,7 +690,7 @@ size_t ZSTD_HcFindBestMatch(
     }
 
     /* HC4 match finder */
-    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
 
     for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
         size_t currentMl=0;
@@ -866,7 +871,6 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
     U32* const hashTable = ms->hashTable;
     BYTE* const tagTable = ms->tagTable;
     U32 const hashLog = ms->rowHashLog;
-    U32 hashSaltEntropyCollected = 0;
     const BYTE* const base = ms->window.base;
 
     DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
@@ -881,9 +885,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
         assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
         tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
         row[pos] = updateStartIdx;
-        hashSaltEntropyCollected = hash;
     }
-    ms->hashSaltEntropy += hashSaltEntropyCollected; /* collect salt entropy */
 }
 
 /* ZSTD_row_update_internal():
@@ -1144,6 +1146,7 @@ size_t ZSTD_RowFindBestMatch(
     const U64 hashSalt = ms->hashSalt;
     U32 nbAttempts = 1U << cappedSearchLog;
     size_t ml=4-1;
+    U32 hash;
 
     /* DMS/DDS variables that may be referenced laster */
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1177,9 +1180,19 @@ size_t ZSTD_RowFindBestMatch(
     }
 
     /* Update the hashTable and tagTable up to (but not including) ip */
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+    if (!ms->lazySkipping) {
+        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
+    } else {
+        /* Stop inserting every position when in the lazy skipping mode.
+         * The hash cache is also not kept up to date in this mode.
+         */
+        hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+        ms->nextToUpdate = curr;
+    }
+    ms->hashSaltEntropy += hash; /* collect salt entropy */
+
     {   /* Get the hash for ip, compute the appropriate row */
-        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
         U32* const row = hashTable + relRow;
@@ -1527,10 +1540,11 @@ ZSTD_compressBlock_lazy_generic(
         assert(offset_2 <= dictAndPrefixLength);
     }
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                            ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1574,7 +1588,16 @@ ZSTD_compressBlock_lazy_generic(
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */;
+            ip += step;
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
@@ -1678,6 +1701,13 @@ ZSTD_compressBlock_lazy_generic(
             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         if (isDxS) {
@@ -1895,12 +1925,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     /* init */
     ip += (ip == prefixStart);
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                            ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1938,7 +1969,16 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
+            ip += step + 1;   /* jump faster over incompressible sections */
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
@@ -2024,6 +2064,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         while (ip <= ilimit) {
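
To exercise the new path end to end, here is a minimal sketch (not part of this commit) that pushes pseudo-random, incompressible input through the public API at level 12, one of the configurations benchmarked above. The buffer size and the rand()-based filler are arbitrary choices:

```c
/* Sketch: compress incompressible data with libzstd to exercise the
 * lazy skipping path. Link with -lzstd, e.g. `cc -O2 demo.c -lzstd`
 * (demo.c is just a placeholder file name). */
#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

int main(void)
{
    size_t const srcSize = 1 << 20;                       /* 1 MiB of pseudo-random bytes */
    size_t const dstCapacity = ZSTD_compressBound(srcSize);
    unsigned char* const src = malloc(srcSize);
    unsigned char* const dst = malloc(dstCapacity);
    size_t i, csize;
    if (!src || !dst) return 1;

    for (i = 0; i < srcSize; ++i) src[i] = (unsigned char)rand();

    csize = ZSTD_compress(dst, dstCapacity, src, srcSize, 12);
    if (ZSTD_isError(csize)) {
        fprintf(stderr, "compression error: %s\n", ZSTD_getErrorName(csize));
        return 1;
    }
    /* Random input doesn't compress, so csize should be slightly above srcSize;
     * this is the scenario where the table above shows the large speedups. */
    printf("compressed %zu -> %zu bytes\n", srcSize, csize);

    free(dst);
    free(src);
    return 0;
}
```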
