@@ -346,91 +346,6 @@ __device__ __forceinline__ unsigned char quantize_2D(float *__restrict__ quadran
346346 }
347347}
348348
349- __global__ void kHistogramScatterAdd2D (float * histogram, int *index1, int *index2, float *src, const int maxidx1, const int n)
350- {
351- const int tid = threadIdx.x + (blockDim.x *blockIdx.x );
352- const int numThreads = blockDim.x *gridDim.x ;
353-
354- for (int i = tid; i < n; i+=numThreads)
355- {
356- int idx = (index1[i]*maxidx1) + index2[i];
357- atomicAdd (&histogram[idx], src[i]);
358- }
359- }
360-
361- #define THREADS_ESTIMATE 512
362- #define NUM_ESTIMATE 8
363- #define BLOCK_ESTIMATE 4096
364-
365- template <typename T>
366- __launch_bounds__ (THREADS_ESTIMATE, 1 )
367- __global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n)
368- {
369- const int n_full = (BLOCK_ESTIMATE*(n/BLOCK_ESTIMATE)) + (n % BLOCK_ESTIMATE == 0 ? 0 : BLOCK_ESTIMATE);
370- int valid_items = (blockIdx.x +1 == gridDim.x ) ? n - (blockIdx.x *BLOCK_ESTIMATE) : BLOCK_ESTIMATE;
371- const int base_idx = (blockIdx.x * BLOCK_ESTIMATE);
372- const float reciprocal_num_blocks = 1 .0f /(n < 4096 ? 1 .0f : (n/BLOCK_ESTIMATE));
373-
374- T vals[NUM_ESTIMATE];
375-
376- typedef hipcub::BlockRadixSort<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::NullType, 4 , true , hipcub::BLOCK_SCAN_RAKING> BlockRadixSort;
377- typedef hipcub::BlockLoad<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
378-
379- __shared__ union {
380- typename LoadFloat::TempStorage loadf;
381- typename BlockRadixSort::TempStorage sort;
382- int smem_qidx[BLOCK_ESTIMATE];
383- } temp_storage;
384-
385- for (unsigned int i = base_idx; i < n_full; i += gridDim.x *BLOCK_ESTIMATE)
386- {
387- valid_items = n - i > BLOCK_ESTIMATE ? BLOCK_ESTIMATE : n - i;
388-
389- // do not process half-blocks
390- if (valid_items < BLOCK_ESTIMATE && n > BLOCK_ESTIMATE){ continue ; }
391-
392- #pragma unroll 4
393- for (int j = 0 ; j < NUM_ESTIMATE; j++)
394- vals[j] = max_val;
395-
396- __syncthreads ();
397- LoadFloat (temp_storage.loadf ).Load (&(A[i]), vals, valid_items);
398-
399- #pragma unroll 4
400- for (int j = 0 ; j < NUM_ESTIMATE; j++)
401- vals[j] = ((float )vals[j]) * reciprocal_num_blocks;
402-
403-
404- __syncthreads ();
405- // sort into striped pattern to mitigate bank conflicts
406- // striped pattern index for thread 0 [0, 1024, 2048, 3096]
407- // striped pattern index for thread 1 [1, 1025, 2049, 3097]
408- BlockRadixSort (temp_storage.sort ).SortBlockedToStriped (vals);
409-
410- __syncthreads ();
411- for (int j = threadIdx.x ; j < BLOCK_ESTIMATE; j+=blockDim.x )
412- temp_storage.smem_qidx [j] = -1 ;
413-
414- __syncthreads ();
415-
416- if (threadIdx.x < 256 )
417- {
418- float q_interval = (1 .0f -(2 .0f *offset))/255 .0f ;
419- int local_idx = round (((offset+(threadIdx.x *q_interval))*(valid_items-1 )));
420- temp_storage.smem_qidx [local_idx] = threadIdx.x ;
421- }
422-
423- __syncthreads ();
424-
425- for (int i = threadIdx.x ; i < BLOCK_ESTIMATE; i+=blockDim.x )
426- {
427- if (temp_storage.smem_qidx [i] != -1 )
428- atomicAdd (&code[temp_storage.smem_qidx [i]], vals[i/THREADS_ESTIMATE]);
429- }
430- }
431- }
432-
433-
434349__launch_bounds__ (TH, 4 )
435350__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n)
436351{
@@ -2984,9 +2899,6 @@ template __global__ void kdequant_mm_int32_fp16<4, 512>(int *__restrict__ const
29842899template __device__ unsigned char dQuantize<0 >(float * smem_code, const float rand, float x);
29852900template __device__ unsigned char dQuantize<1 >(float * smem_code, const float rand, float x);
29862901
2987- template __global__ void kEstimateQuantiles (float *__restrict__ const A, float *code, const float offset, const float max_val, const int n);
2988- template __global__ void kEstimateQuantiles (half *__restrict__ const A, float *code, const float offset, const half max_val, const int n);
2989-
29902902#define MAKE_PreconditionOptimizer32bit1State (oname, gtype ) \
29912903template __global__ void kPreconditionOptimizer32bit1State <gtype, oname, 4096 , 8 >(gtype* g, gtype* p, \
29922904 float * state1, float *unorm, \
0 commit comments