
Commit acca036

[lazy] Skip over incompressible data
Every 256 bytes the lazy match finders process without finding a match, they increase their step size by 1: for bytes [0, 256) they search every position, for bytes [256, 512) every other position, and so on. However, they currently still insert every position into their hash tables. This is different from fast & dfast, which only insert the positions they search.

This PR changes that: after we've searched 2KB without finding any matches, at which point we're only searching one in 9 positions, we stop inserting every position and only insert the positions we search. The exact cutoff of 2KB isn't terribly important; I've just selected a cutoff that is reasonably large, to minimize the impact on "normal" data. This PR only adds skipping to greedy, lazy, and lazy2, but does not touch btlazy2.

| Dataset | Level | Compiler     | CSize ∆ | Speed ∆ |
|---------|-------|--------------|---------|---------|
| Random  | 5     | clang-14.0.6 | 0.0%    | +704%   |
| Random  | 5     | gcc-12.2.0   | 0.0%    | +670%   |
| Random  | 7     | clang-14.0.6 | 0.0%    | +679%   |
| Random  | 7     | gcc-12.2.0   | 0.0%    | +657%   |
| Random  | 12    | clang-14.0.6 | 0.0%    | +1355%  |
| Random  | 12    | gcc-12.2.0   | 0.0%    | +1331%  |
| Silesia | 5     | clang-14.0.6 | +0.002% | +0.35%  |
| Silesia | 5     | gcc-12.2.0   | +0.002% | +2.45%  |
| Silesia | 7     | clang-14.0.6 | +0.001% | -1.40%  |
| Silesia | 7     | gcc-12.2.0   | +0.007% | +0.13%  |
| Silesia | 12    | clang-14.0.6 | +0.011% | +22.70% |
| Silesia | 12    | gcc-12.2.0   | +0.011% | -6.68%  |
| Enwik8  | 5     | clang-14.0.6 | 0.0%    | -1.02%  |
| Enwik8  | 5     | gcc-12.2.0   | 0.0%    | +0.34%  |
| Enwik8  | 7     | clang-14.0.6 | 0.0%    | -1.22%  |
| Enwik8  | 7     | gcc-12.2.0   | 0.0%    | -0.72%  |
| Enwik8  | 12    | clang-14.0.6 | 0.0%    | +26.19% |
| Enwik8  | 12    | gcc-12.2.0   | 0.0%    | -5.70%  |

The speed difference for clang at level 12 is real, but is probably caused by some sort of alignment or codegen issue: clang is significantly slower than gcc before this PR, and gets up to parity with it afterwards.

I also measured the ratio difference for the HC match finder, and it looks basically the same as for the row-based match finder. The speedup on random data looks similar, and performance is otherwise about neutral, without the big difference at level 12 for either clang or gcc.
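For a concrete feel for the cutoff, here is a small standalone sketch (not part of this commit) of the step arithmetic described above. It assumes kSearchStrength is 8, matching the "every 256 bytes" behaviour, and the list of sample distances is arbitrary:

```c
#include <stdio.h>
#include <stddef.h>

/* Assumed constants: kSearchStrength = 8 gives one step increase per 256 bytes,
 * as described in the commit message; kLazySkippingStep is the cutoff added here. */
#define kSearchStrength   8
#define kLazySkippingStep 8

int main(void)
{
    size_t const distances[] = {0, 255, 256, 1024, 2047, 2048, 4096};
    size_t i;
    for (i = 0; i < sizeof(distances)/sizeof(distances[0]); ++i) {
        size_t const dist = distances[i];                   /* ip - anchor: bytes since the last match */
        size_t const step = (dist >> kSearchStrength) + 1;  /* positions skipped per failed search */
        int const lazySkipping = step > kLazySkippingStep;  /* stop inserting every position? */
        printf("dist=%5zu  step=%2zu  lazySkipping=%d\n", dist, step, lazySkipping);
    }
    /* At dist >= 2048 (2KB without a match) step reaches 9 > 8, so the lazy
     * match finders only insert the positions they actually search. */
    return 0;
}
```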
Parent commit: 488e45f

3 files changed: +71, -16 lines changed

lib/compress/zstd_compress.c

Lines changed: 1 addition & 0 deletions
@@ -1947,6 +1947,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
     }
 
     ms->hashLog3 = hashLog3;
+    ms->lazySkipping = 0;
 
     ZSTD_invalidateMatchState(ms);

lib/compress/zstd_compress_internal.h

Lines changed: 7 additions & 0 deletions
@@ -249,6 +249,13 @@ struct ZSTD_matchState_t {
      * This behavior is controlled from the cctx ms.
      * This parameter has no effect in the cdict ms. */
     int prefetchCDictTables;
+
+    /* When == 0, lazy match finders insert every position.
+     * When != 0, lazy match finders only insert positions they search.
+     * This allows them to skip much faster over incompressible data,
+     * at a small cost to compression ratio.
+     */
+    int lazySkipping;
 };
 
 typedef struct {

lib/compress/zstd_lazy.c

Lines changed: 63 additions & 16 deletions
@@ -12,6 +12,8 @@
 #include "zstd_lazy.h"
 #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
 
+#define kLazySkippingStep 8
+
 
 /*-*************************************
 *  Binary Tree search
@@ -618,7 +620,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
 FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
                         ZSTD_matchState_t* ms,
                         const ZSTD_compressionParameters* const cParams,
-                        const BYTE* ip, U32 const mls)
+                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
 {
     U32* const hashTable = ms->hashTable;
     const U32 hashLog = cParams->hashLog;
@@ -633,6 +635,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
         NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
         hashTable[h] = idx;
         idx++;
+        /* Stop inserting every position when in the lazy skipping mode. */
+        if (lazySkipping)
+            break;
     }
 
     ms->nextToUpdate = target;
@@ -641,7 +646,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
 
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
 }
 
 /* inlining is important to hardwire a hot branch (template emulation) */
@@ -685,7 +690,7 @@ size_t ZSTD_HcFindBestMatch(
     }
 
     /* HC4 match finder */
-    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
 
     for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
         size_t currentMl=0;
@@ -866,7 +871,6 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
     U32* const hashTable = ms->hashTable;
     BYTE* const tagTable = ms->tagTable;
     U32 const hashLog = ms->rowHashLog;
-    U32 hashSaltEntropyCollected = 0;
     const BYTE* const base = ms->window.base;
 
     DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
@@ -881,9 +885,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
         assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
         tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
         row[pos] = updateStartIdx;
-        hashSaltEntropyCollected = hash;
     }
-    ms->hashSaltEntropy += hashSaltEntropyCollected; /* collect salt entropy */
 }
 
 /* ZSTD_row_update_internal():
@@ -1144,6 +1146,7 @@ size_t ZSTD_RowFindBestMatch(
     const U64 hashSalt = ms->hashSalt;
     U32 nbAttempts = 1U << cappedSearchLog;
     size_t ml=4-1;
+    U32 hash;
 
     /* DMS/DDS variables that may be referenced laster */
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1177,9 +1180,19 @@ size_t ZSTD_RowFindBestMatch(
     }
 
     /* Update the hashTable and tagTable up to (but not including) ip */
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+    if (!ms->lazySkipping) {
+        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
+    } else {
+        /* Stop inserting every position when in the lazy skipping mode.
+         * The hash cache is also not kept up to date in this mode.
+         */
+        hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+        ms->nextToUpdate = curr;
+    }
+    ms->hashSaltEntropy += hash; /* collect salt entropy */
+
     {   /* Get the hash for ip, compute the appropriate row */
-        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
         U32* const row = hashTable + relRow;
@@ -1527,10 +1540,11 @@ ZSTD_compressBlock_lazy_generic(
         assert(offset_2 <= dictAndPrefixLength);
     }
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                            ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1574,7 +1588,16 @@ ZSTD_compressBlock_lazy_generic(
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */;
+            ip += step;
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
@@ -1678,6 +1701,13 @@ ZSTD_compressBlock_lazy_generic(
             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         if (isDxS) {
@@ -1895,12 +1925,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     /* init */
     ip += (ip == prefixStart);
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                            ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1938,7 +1969,16 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
+            ip += step + 1;   /* jump faster over incompressible sections */
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
@@ -2024,6 +2064,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         while (ip <= ilimit) {
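
To exercise the new path end to end, here is a minimal sketch (not part of this commit) that pushes pseudo-random, incompressible input through the public API at level 12, one of the configurations benchmarked above. The buffer size and the rand()-based filler are arbitrary choices:

```c
/* Sketch: compress incompressible data with libzstd to exercise the
 * lazy skipping path. Link with -lzstd, e.g. `cc -O2 demo.c -lzstd`
 * (demo.c is just a placeholder file name). */
#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

int main(void)
{
    size_t const srcSize = 1 << 20;                       /* 1 MiB of pseudo-random bytes */
    size_t const dstCapacity = ZSTD_compressBound(srcSize);
    unsigned char* const src = malloc(srcSize);
    unsigned char* const dst = malloc(dstCapacity);
    size_t i, csize;
    if (!src || !dst) return 1;

    for (i = 0; i < srcSize; ++i) src[i] = (unsigned char)rand();

    csize = ZSTD_compress(dst, dstCapacity, src, srcSize, 12);
    if (ZSTD_isError(csize)) {
        fprintf(stderr, "compression error: %s\n", ZSTD_getErrorName(csize));
        return 1;
    }
    /* Random input doesn't compress, so csize should be slightly above srcSize;
     * this is the scenario where the table above shows the large speedups. */
    printf("compressed %zu -> %zu bytes\n", srcSize, csize);

    free(dst);
    free(src);
    return 0;
}
```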
