@@ -368,67 +368,175 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
368368}
369369
370370typedef struct {
371- U32 base ;
372- U32 curr ;
371+ U16 base ;
372+ U16 curr ;
373373} rankPos ;
374374
375375typedef nodeElt huffNodeTable [HUF_CTABLE_WORKSPACE_SIZE_U32 ];
376376
377- #define RANK_POSITION_TABLE_SIZE 32
377+ /* Number of buckets available for HUF_sort() */
378+ #define RANK_POSITION_TABLE_SIZE 128
378379
379380typedef struct {
380381 huffNodeTable huffNodeTbl ;
381382 rankPos rankPosition [RANK_POSITION_TABLE_SIZE ];
382383} HUF_buildCTable_wksp_tables ;
383384
/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
 * Strategy is to use as many buckets as possible for representing distinct
 * counts while using the remainder to represent all counts up to HUF_BLOCKSIZE_MAX
 * using log2 bucketing.
 *
 * To satisfy this requirement for 128 buckets, we can do the following:
 * Let buckets 0-114 represent distinct counts of [0, 114]
 * Let buckets 115 to 126 represent counts of [115, HUF_BLOCKSIZE_MAX]. (the final bucket 127 must remain empty)
 *
 * Note that we don't actually need 17 buckets (assuming 2^17 maxcount) for log2 bucketing since
 * the first few buckets in the log2 bucketing representation are already covered by the distinct count bucketing.
 *
 * Both replacement lists are fully parenthesized so the macros expand safely
 * inside any surrounding expression (e.g. `x + MACRO`, `x < MACRO`).
 */
#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - BIT_highbit32(HUF_BLOCKSIZE_MAX) - 1)
#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN))
399+
400+ /* Return the appropriate bucket index for a given count. See definition of
401+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
402+ */
403+ static U32 HUF_getIndex (U32 const count ) {
404+ return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF )
405+ ? count
406+ : BIT_highbit32 (count ) + RANK_POSITION_LOG_BUCKETS_BEGIN ;
407+ }
408+
409+ /* Helper swap function for HUF_quickSortPartition() */
410+ static void HUF_swapNodes (nodeElt * a , nodeElt * b ) {
411+ nodeElt tmp = * a ;
412+ * a = * b ;
413+ * b = tmp ;
414+ }
415+
416+ /* Returns 0 if the huffNode array is not sorted by descending count */
417+ UNUSED_ATTR
418+ static int HUF_isSorted (nodeElt huffNode [], U32 const maxSymbolValue1 ) {
419+ U32 i ;
420+ for (i = 1 ; i < maxSymbolValue1 ; ++ i ) {
421+ if (huffNode [i ].count > huffNode [i - 1 ].count ) {
422+ return 0 ;
423+ }
424+ }
425+ return 1 ;
426+ }
427+
428+ /* Insertion sort by descending order */
429+ HINT_INLINE void HUF_insertionSort (nodeElt huffNode [], int const low , int const high ) {
430+ int i ;
431+ int const size = high - low + 1 ;
432+ huffNode += low ;
433+ for (i = 1 ; i < size ; ++ i ) {
434+ nodeElt const key = huffNode [i ];
435+ int j = i - 1 ;
436+ while (j >= 0 && huffNode [j ].count < key .count ) {
437+ huffNode [j + 1 ] = huffNode [j ];
438+ j -- ;
439+ }
440+ huffNode [j + 1 ] = key ;
441+ }
442+ }
443+
444+ /* Pivot helper function for quicksort. */
445+ static int HUF_quickSortPartition (nodeElt arr [], int const low , int const high ) {
446+ /* Simply select rightmost element as pivot. "Better" selectors like
447+ * median-of-three don't experimentally appear to have any benefit.
448+ */
449+ U32 const pivot = arr [high ].count ;
450+ int i = low - 1 ;
451+ int j = low ;
452+ for ( ; j < high ; j ++ ) {
453+ if (arr [j ].count > pivot ) {
454+ i ++ ;
455+ HUF_swapNodes (& arr [i ], & arr [j ]);
456+ }
457+ }
458+ HUF_swapNodes (& arr [i + 1 ], & arr [high ]);
459+ return i + 1 ;
460+ }
461+
462+ /* Classic quicksort by descending with partially iterative calls
463+ * to reduce worst case callstack size.
464+ */
465+ static void HUF_simpleQuickSort (nodeElt arr [], int low , int high ) {
466+ int const kInsertionSortThreshold = 8 ;
467+ if (high - low < kInsertionSortThreshold ) {
468+ HUF_insertionSort (arr , low , high );
469+ return ;
470+ }
471+ while (low < high ) {
472+ int const idx = HUF_quickSortPartition (arr , low , high );
473+ if (idx - low < high - idx ) {
474+ HUF_simpleQuickSort (arr , low , idx - 1 );
475+ low = idx + 1 ;
476+ } else {
477+ HUF_simpleQuickSort (arr , idx + 1 , high );
478+ high = idx - 1 ;
479+ }
480+ }
481+ }
482+
384483/**
385484 * HUF_sort():
386485 * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
486+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
387487 *
388488 * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
389489 * Must have (maxSymbolValue + 1) entries.
390490 * @param[in] count Histogram of the symbols.
391491 * @param[in] maxSymbolValue Maximum symbol value.
392492 * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
393493 */
394- static void HUF_sort (nodeElt * huffNode , const unsigned * count , U32 maxSymbolValue , rankPos * rankPosition )
395- {
396- int n ;
397- int const maxSymbolValue1 = (int )maxSymbolValue + 1 ;
494+ static void HUF_sort (nodeElt huffNode [], const unsigned count [], U32 const maxSymbolValue , rankPos rankPosition []) {
495+ U32 n ;
496+ U32 const maxSymbolValue1 = maxSymbolValue + 1 ;
398497
399498 /* Compute base and set curr to base.
400- * For symbol s let lowerRank = BIT_highbit32 (count[n]+1 ) and rank = lowerRank + 1.
401- * Then 2^lowerRank <= count[n]+1 <= 2^rank .
499+ * For symbol s let lowerRank = HUF_getIndex (count[n]) and rank = lowerRank + 1.
500+ * See HUF_getIndex to see bucketing strategy .
402501 * We attribute each symbol to lowerRank's base value, because we want to know where
403502 * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
404503 */
405504 ZSTD_memset (rankPosition , 0 , sizeof (* rankPosition ) * RANK_POSITION_TABLE_SIZE );
406505 for (n = 0 ; n < maxSymbolValue1 ; ++ n ) {
407- U32 lowerRank = BIT_highbit32 (count [n ] + 1 );
506+ U32 lowerRank = HUF_getIndex (count [n ]);
507+ assert (lowerRank < RANK_POSITION_TABLE_SIZE - 1 );
408508 rankPosition [lowerRank ].base ++ ;
409509 }
510+
410511 assert (rankPosition [RANK_POSITION_TABLE_SIZE - 1 ].base == 0 );
512+ /* Set up the rankPosition table */
411513 for (n = RANK_POSITION_TABLE_SIZE - 1 ; n > 0 ; -- n ) {
412514 rankPosition [n - 1 ].base += rankPosition [n ].base ;
413515 rankPosition [n - 1 ].curr = rankPosition [n - 1 ].base ;
414516 }
415- /* Sort */
517+
518+ /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
416519 for (n = 0 ; n < maxSymbolValue1 ; ++ n ) {
417520 U32 const c = count [n ];
418- U32 const r = BIT_highbit32 (c + 1 ) + 1 ;
419- U32 pos = rankPosition [r ].curr ++ ;
420- /* Insert into the correct position in the rank.
421- * We have at most 256 symbols, so this insertion should be fine.
422- */
423- while ((pos > rankPosition [r ].base ) && (c > huffNode [pos - 1 ].count )) {
424- huffNode [pos ] = huffNode [pos - 1 ];
425- pos -- ;
426- }
521+ U32 const r = HUF_getIndex (c ) + 1 ;
522+ U32 const pos = rankPosition [r ].curr ++ ;
523+ assert (pos < maxSymbolValue1 );
427524 huffNode [pos ].count = c ;
428525 huffNode [pos ].byte = (BYTE )n ;
429526 }
430- }
431527
528+ /* Sort each bucket. */
529+ for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF ; n < RANK_POSITION_TABLE_SIZE - 1 ; ++ n ) {
530+ U32 const bucketSize = rankPosition [n ].curr - rankPosition [n ].base ;
531+ U32 const bucketStartIdx = rankPosition [n ].base ;
532+ if (bucketSize > 1 ) {
533+ assert (bucketStartIdx < maxSymbolValue1 );
534+ HUF_simpleQuickSort (huffNode + bucketStartIdx , 0 , bucketSize - 1 );
535+ }
536+ }
537+
538+ assert (HUF_isSorted (huffNode , maxSymbolValue1 ));
539+ }
432540
433541/** HUF_buildCTable_wksp() :
434542 * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
0 commit comments