
Commit 5151272

Improve Huffman sorting algorithm
1 parent dd4f6aa commit 5151272

File tree

1 file changed (+113, -19 lines)

lib/compress/huf_compress.c

Lines changed: 113 additions & 19 deletions
@@ -367,67 +367,161 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 }
 
 typedef struct {
-    U32 base;
-    U32 curr;
+    U16 base;
+    U16 curr;
 } rankPos;
 
 typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
 
-#define RANK_POSITION_TABLE_SIZE 32
+/* Number of buckets available for HUF_sort() */
+#define RANK_POSITION_TABLE_SIZE 64
+
+/* Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
+ * Strategy is to use as many buckets as possible for representing distinct
+ * counts while using the remainder to represent all counts up to HUF_BLOCKSIZE_MAX
+ * using log2 bucketing.
+ *
+ * To satisfy this requirement for 64 buckets, we can do the following:
+ * Let buckets 0-51 represent distinct counts of [0, 51]
+ * Let buckets 52 to 63 represent counts of [52, HUF_BLOCKSIZE_MAX == 131072].
+ *
+ * We determine this dynamically at compile-time as follows:
+ */
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_TABLE_SIZE - BIT_highbit32(HUF_BLOCKSIZE_MAX) - 1)
+
+/* We don't actually need 17 buckets (assuming 2^17 maxcount) since the first few buckets in the
+ * log2 bucketing representation are already covered by the distinct count bucketing.
+ */
+#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_DISTINCT_COUNT_CUTOFF - BIT_highbit32(RANK_POSITION_DISTINCT_COUNT_CUTOFF))
 
 typedef struct {
     huffNodeTable huffNodeTbl;
     rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
 } HUF_buildCTable_wksp_tables;
 
+/* Simple insertion sort by descending order */
+HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const n)
+{
+    int i;
+    for (i = 1; i < n; i++) {
+        nodeElt const key = huffNode[i];
+        int j = i - 1;
+        while (j >= 0 && huffNode[j].count < key.count) {
+            huffNode[j + 1] = huffNode[j];
+            j--;
+        }
+        huffNode[j + 1] = key;
+    }
+}
+
+/* Return the appropriate bucket index for a given count. See definition of
+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
+ */
+static U32 HUF_getIndex(U32 const count) {
+    return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
+        ? count
+        : BIT_highbit32(count + 1) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+}
+
+
+/* Helper swap function for HUF_quickSortPartition() */
+static void HUF_swapNodes(nodeElt* a, nodeElt* b) {
+    nodeElt tmp = *a;
+    *a = *b;
+    *b = tmp;
+}
+
+/* Classic pivot helper function for quicksort. */
+static int HUF_quickSortPartition(nodeElt arr[], int low, int high) {
+    /* Simply select rightmost element as pivot. "Better" selectors like
+     * median-of-three don't experimentally appear to have any benefit.
+     */
+    U32 const pivot = arr[high].count;
+    int i = low - 1;
+    int j = low;
+    for ( ; j < high; j++) {
+        if (arr[j].count > pivot) {
+            i++;
+            HUF_swapNodes(&arr[i], &arr[j]);
+        }
+    }
+    HUF_swapNodes(&arr[i + 1], &arr[high]);
+    return i + 1;
+}
+
+/* Classic quicksort by descending with partially iterative calls
+ * to reduce worst case callstack size.
+ */
+static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) {
+    while (low < high) {
+        int const idx = HUF_quickSortPartition(arr, low, high);
+        if (idx - low < high - idx) {
+            HUF_simpleQuickSort(arr, low, idx - 1);
+            low = idx + 1;
+        } else {
+            HUF_simpleQuickSort(arr, idx + 1, high);
+            high = idx - 1;
+        }
+    }
+}
+
+
 /**
  * HUF_sort():
  * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
  *
  * @param[out] huffNode       Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
  *                            Must have (maxSymbolValue + 1) entries.
  * @param[in]  count          Histogram of the symbols.
  * @param[in]  maxSymbolValue Maximum symbol value.
  * @param      rankPosition   This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
  */
-static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition)
-{
+static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) {
     int n;
     int const maxSymbolValue1 = (int)maxSymbolValue + 1;
 
     /* Compute base and set curr to base.
-     * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1.
-     * Then 2^lowerRank <= count[n]+1 <= 2^rank.
+     * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1.
+     * See HUF_getIndex to see bucketing strategy.
     * We attribute each symbol to lowerRank's base value, because we want to know where
     * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
     */
     ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
     for (n = 0; n < maxSymbolValue1; ++n) {
-        U32 lowerRank = BIT_highbit32(count[n] + 1);
+        U32 lowerRank = HUF_getIndex(count[n]);
         rankPosition[lowerRank].base++;
     }
+
     assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0);
+    /* Set up the rankPosition table */
     for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) {
         rankPosition[n-1].base += rankPosition[n].base;
         rankPosition[n-1].curr = rankPosition[n-1].base;
     }
-    /* Sort */
+
+    /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
     for (n = 0; n < maxSymbolValue1; ++n) {
         U32 const c = count[n];
-        U32 const r = BIT_highbit32(c+1) + 1;
-        U32 pos = rankPosition[r].curr++;
-        /* Insert into the correct position in the rank.
-         * We have at most 256 symbols, so this insertion should be fine.
-         */
-        while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) {
-            huffNode[pos] = huffNode[pos-1];
-            pos--;
-        }
+        U32 const r = HUF_getIndex(c) + 1;
+        U32 const pos = rankPosition[r].curr++;
         huffNode[pos].count = c;
         huffNode[pos].byte = (BYTE)n;
     }
-}
 
+    /* Sort each bucket. */
+    for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
+        U32 const bucketSize = rankPosition[n].curr - rankPosition[n].base;
+        U32 const bucketStartIdx = rankPosition[n].base;
+        if (bucketSize <= 1) {
+            continue;
+        } else if (bucketSize <= 128) {
+            HUF_insertionSort(huffNode + bucketStartIdx, bucketSize);
+        } else {
+            HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize - 1);
+        }
+    }
+}
 
 /** HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.