@@ -367,67 +367,161 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
367367}
368368
/* Output-position pair that HUF_sort() keeps for each rank:
 * `base` ends up as the index in the sorted array where the rank's symbols start,
 * `curr` is the next free slot inside that rank (it starts equal to `base`
 * and is incremented each time a symbol is placed).
 */
369369typedef struct {
370- U32 base ;
371- U32 curr ;
370+ U16 base ;
371+ U16 curr ;
372372} rankPos ;
373373
374374typedef nodeElt huffNodeTable [HUF_CTABLE_WORKSPACE_SIZE_U32 ];
375375
376- #define RANK_POSITION_TABLE_SIZE 32
376+ /* Number of buckets available for HUF_sort() */
377+ #define RANK_POSITION_TABLE_SIZE 64
378+
379+ /* Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
380+ * Strategy is to use as many buckets as possible for representing distinct
381+ * counts while using the remainder to represent all counts up to HUF_BLOCKSIZE_MAX
382+ * using log2 bucketing.
383+ *
384+ * To satisfy this requirement for 64 buckets, we can do the following:
385+ * Let buckets 0-45 represent distinct counts of [0, 45]
386+ * Let buckets 46 to 58 represent counts of [46, HUF_BLOCKSIZE_MAX == 131072] using log2 bucketing.
387+ *
388+ * We determine this dynamically at compile-time as follows:
389+ */
390+ #define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_TABLE_SIZE - BIT_highbit32(HUF_BLOCKSIZE_MAX) - 1)
391+
392+ /* We don't actually need 17 buckets (assuming 2^17 maxcount) since the first few buckets in the
393+ * log2 bucketing representation are already covered by the distinct count bucketing.
394+ */
395+ #define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_DISTINCT_COUNT_CUTOFF - BIT_highbit32(RANK_POSITION_DISTINCT_COUNT_CUTOFF))
377396
/* Scratch tables grouped into a single struct.
 * `rankPosition` has RANK_POSITION_TABLE_SIZE entries, which is what HUF_sort()
 * requires for its scratch argument.
 * NOTE(review): the name suggests this is the layout of the HUF_buildCTable_wksp()
 * workspace and that `huffNodeTbl` holds the tree nodes — confirm at the use site.
 */
378397typedef struct {
379398 huffNodeTable huffNodeTbl ;
380399 rankPos rankPosition [RANK_POSITION_TABLE_SIZE ];
381400} HUF_buildCTable_wksp_tables ;
382401
402+ /* Simple insertion sort by descending order */
403+ HINT_INLINE void HUF_insertionSort (nodeElt huffNode [], int const n )
404+ {
405+ int i ;
406+ for (i = 1 ; i < n ; i ++ ) {
407+ nodeElt const key = huffNode [i ];
408+ int j = i - 1 ;
409+ while (j >= 0 && huffNode [j ].count < key .count ) {
410+ huffNode [j + 1 ] = huffNode [j ];
411+ j -- ;
412+ }
413+ huffNode [j + 1 ] = key ;
414+ }
415+ }
416+
417+ /* Return the appropriate bucket index for a given count. See definition of
418+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
419+ */
420+ static U32 HUF_getIndex (U32 const count ) {
421+ return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF )
422+ ? count
423+ : BIT_highbit32 (count + 1 ) + RANK_POSITION_LOG_BUCKETS_BEGIN ;
424+ }
425+
426+
427+ /* Helper swap function for HUF_quickSortPartition() */
428+ static void HUF_swapNodes (nodeElt * a , nodeElt * b ) {
429+ nodeElt tmp = * a ;
430+ * a = * b ;
431+ * b = tmp ;
432+ }
433+
434+ /* Classic pivot helper function for quicksort. */
435+ static int HUF_quickSortPartition (nodeElt arr [], int low , int high ) {
436+ /* Simply select rightmost element as pivot. "Better" selectors like
437+ * median-of-three don't experimentally appear to have any benefit.
438+ */
439+ U32 const pivot = arr [high ].count ;
440+ int i = low - 1 ;
441+ int j = low ;
442+ for ( ; j < high ; j ++ ) {
443+ if (arr [j ].count > pivot ) {
444+ i ++ ;
445+ HUF_swapNodes (& arr [i ], & arr [j ]);
446+ }
447+ }
448+ HUF_swapNodes (& arr [i + 1 ], & arr [high ]);
449+ return i + 1 ;
450+ }
451+
452+ /* Classic quicksort by descending with partially iterative calls
453+ * to reduce worst case callstack size.
454+ */
455+ static void HUF_simpleQuickSort (nodeElt arr [], int low , int high ) {
456+ while (low < high ) {
457+ int const idx = HUF_quickSortPartition (arr , low , high );
458+ if (idx - low < high - idx ) {
459+ HUF_simpleQuickSort (arr , low , idx - 1 );
460+ low = idx + 1 ;
461+ } else {
462+ HUF_simpleQuickSort (arr , idx + 1 , high );
463+ high = idx - 1 ;
464+ }
465+ }
466+ }
467+
468+
383469/**
384470 * HUF_sort():
385471 * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
472+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
 * Symbols are first distributed into buckets chosen by HUF_getIndex(count[symbol]),
 * with higher buckets placed earlier in the output; each bucket that can hold
 * different counts is then sorted on its own.
386473 *
387474 * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
388475 * Must have (maxSymbolValue + 1) entries.
389476 * @param[in] count Histogram of the symbols.
390477 * @param[in] maxSymbolValue Maximum symbol value.
391478 * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
 *
 * NOTE(review): the bucket layout appears to assume count[symbol] <= HUF_BLOCKSIZE_MAX
 * (see RANK_POSITION_DISTINCT_COUNT_CUTOFF) — confirm that callers guarantee this.
392479 */
393- static void HUF_sort (nodeElt * huffNode , const unsigned * count , U32 maxSymbolValue , rankPos * rankPosition )
394- {
480+ static void HUF_sort (nodeElt huffNode [], const unsigned count [], U32 const maxSymbolValue , rankPos rankPosition []) {
395481 int n ;
396482 int const maxSymbolValue1 = (int )maxSymbolValue + 1 ;
397483
398484 /* Compute base and set curr to base.
399- * For symbol s let lowerRank = BIT_highbit32 (count[n]+1 ) and rank = lowerRank + 1.
400- * Then 2^lowerRank <= count[n]+1 <= 2^rank .
485+ * For symbol n let lowerRank = HUF_getIndex (count[n]) and rank = lowerRank + 1.
486+ * See HUF_getIndex() for the bucketing strategy.
401487 * We attribute each symbol to lowerRank's base value, because we want to know where
402488 * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
403489 */
404490 ZSTD_memset (rankPosition , 0 , sizeof (* rankPosition ) * RANK_POSITION_TABLE_SIZE );
405491 for (n = 0 ; n < maxSymbolValue1 ; ++ n ) {
406- U32 lowerRank = BIT_highbit32 (count [n ] + 1 );
492+ U32 lowerRank = HUF_getIndex (count [n ]);
407493 rankPosition [lowerRank ].base ++ ;
408494 }
495+
/* No symbol may fall in the last bucket index: it would be placed at rank = index + 1,
 * which is one past the end of the rankPosition table.
 */
409496 assert (rankPosition [RANK_POSITION_TABLE_SIZE - 1 ].base == 0 );
497+ /* Turn the per-bucket tallies into start offsets: after this pass base[r] is the number
 * of symbols whose bucket index is >= r, and curr[r] starts out equal to base[r].
 */
410498 for (n = RANK_POSITION_TABLE_SIZE - 1 ; n > 0 ; -- n ) {
411499 rankPosition [n - 1 ].base += rankPosition [n ].base ;
412500 rankPosition [n - 1 ].curr = rankPosition [n - 1 ].base ;
413501 }
414- /* Sort */
502+
503+ /* Place each symbol in the next free slot of its rank (bucket index + 1), advancing that
 * rank's curr. Symbols inside one rank are left in symbol order here; they are ordered below.
 */
415504 for (n = 0 ; n < maxSymbolValue1 ; ++ n ) {
416505 U32 const c = count [n ];
417- U32 const r = BIT_highbit32 (c + 1 ) + 1 ;
418- U32 pos = rankPosition [r ].curr ++ ;
419- /* Insert into the correct position in the rank.
420- * We have at most 256 symbols, so this insertion should be fine.
421- */
422- while ((pos > rankPosition [r ].base ) && (c > huffNode [pos - 1 ].count )) {
423- huffNode [pos ] = huffNode [pos - 1 ];
424- pos -- ;
425- }
506+ U32 const r = HUF_getIndex (c ) + 1 ;
507+ U32 const pos = rankPosition [r ].curr ++ ;
426508 huffNode [pos ].count = c ;
427509 huffNode [pos ].byte = (BYTE )n ;
428510 }
429- }
430511
512+ /* Sort each bucket.
 * Ranks below RANK_POSITION_DISTINCT_COUNT_CUTOFF are skipped: HUF_getIndex() returns the
 * count itself below the cutoff, so each of those ranks only holds symbols of one exact count.
 * [base, curr) is the filled range of a rank. Ranges of at most 128 entries use insertion
 * sort, larger ones use quicksort.
 */
513+ for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF ; n < RANK_POSITION_TABLE_SIZE - 1 ; ++ n ) {
514+ U32 const bucketSize = rankPosition [n ].curr - rankPosition [n ].base ;
515+ U32 const bucketStartIdx = rankPosition [n ].base ;
516+ if (bucketSize <= 1 ) {
517+ continue ;
518+ } else if (bucketSize <= 128 ) {
519+ HUF_insertionSort (huffNode + bucketStartIdx , bucketSize );
520+ } else {
521+ HUF_simpleQuickSort (huffNode + bucketStartIdx , 0 , bucketSize - 1 );
522+ }
523+ }
524+ }
431525
432526/** HUF_buildCTable_wksp() :
433527 * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
0 commit comments