Skip to content

Commit e2ddda3

Browse files
authored
Merge pull request #78 from MISHANMAURYA/remove-estimate-quantiles-histogram-scatter
Remove estimate quantiles histogram scatter
2 parents ca04bc5 + 7a17f2d commit e2ddda3

File tree

4 files changed

+0
-120
lines changed

4 files changed

+0
-120
lines changed

csrc/kernels.hip

Lines changed: 0 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -346,91 +346,6 @@ __device__ __forceinline__ unsigned char quantize_2D(float *__restrict__ quadran
346346
}
347347
}
348348

349-
__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n)
350-
{
351-
const int tid = threadIdx.x + (blockDim.x*blockIdx.x);
352-
const int numThreads = blockDim.x*gridDim.x;
353-
354-
for(int i = tid; i < n; i+=numThreads)
355-
{
356-
int idx = (index1[i]*maxidx1) + index2[i];
357-
atomicAdd(&histogram[idx], src[i]);
358-
}
359-
}
360-
361-
#define THREADS_ESTIMATE 512
362-
#define NUM_ESTIMATE 8
363-
#define BLOCK_ESTIMATE 4096
364-
365-
template<typename T>
366-
__launch_bounds__(THREADS_ESTIMATE, 1)
367-
__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n)
368-
{
369-
const int n_full = (BLOCK_ESTIMATE*(n/BLOCK_ESTIMATE)) + (n % BLOCK_ESTIMATE == 0 ? 0 : BLOCK_ESTIMATE);
370-
int valid_items = (blockIdx.x+1 == gridDim.x) ? n - (blockIdx.x*BLOCK_ESTIMATE) : BLOCK_ESTIMATE;
371-
const int base_idx = (blockIdx.x * BLOCK_ESTIMATE);
372-
const float reciprocal_num_blocks = 1.0f/(n < 4096 ? 1.0f : (n/BLOCK_ESTIMATE));
373-
374-
T vals[NUM_ESTIMATE];
375-
376-
typedef hipcub::BlockRadixSort<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::NullType, 4, true, hipcub::BLOCK_SCAN_RAKING> BlockRadixSort;
377-
typedef hipcub::BlockLoad<T, THREADS_ESTIMATE, NUM_ESTIMATE, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadFloat;
378-
379-
__shared__ union {
380-
typename LoadFloat::TempStorage loadf;
381-
typename BlockRadixSort::TempStorage sort;
382-
int smem_qidx[BLOCK_ESTIMATE];
383-
} temp_storage;
384-
385-
for (unsigned int i = base_idx; i < n_full; i += gridDim.x*BLOCK_ESTIMATE)
386-
{
387-
valid_items = n - i > BLOCK_ESTIMATE ? BLOCK_ESTIMATE : n - i;
388-
389-
// do not process half-blocks
390-
if(valid_items < BLOCK_ESTIMATE && n > BLOCK_ESTIMATE){ continue; }
391-
392-
#pragma unroll 4
393-
for(int j = 0; j < NUM_ESTIMATE; j++)
394-
vals[j] = max_val;
395-
396-
__syncthreads();
397-
LoadFloat(temp_storage.loadf).Load(&(A[i]), vals, valid_items);
398-
399-
#pragma unroll 4
400-
for(int j = 0; j < NUM_ESTIMATE; j++)
401-
vals[j] = ((float)vals[j]) * reciprocal_num_blocks;
402-
403-
404-
__syncthreads();
405-
// sort into striped pattern to mitigate bank conflicts
406-
// striped pattern index for thread 0 [0, 1024, 2048, 3096]
407-
// striped pattern index for thread 1 [1, 1025, 2049, 3097]
408-
BlockRadixSort(temp_storage.sort).SortBlockedToStriped(vals);
409-
410-
__syncthreads();
411-
for(int j = threadIdx.x; j < BLOCK_ESTIMATE; j+=blockDim.x)
412-
temp_storage.smem_qidx[j] = -1;
413-
414-
__syncthreads();
415-
416-
if(threadIdx.x < 256)
417-
{
418-
float q_interval = (1.0f-(2.0f*offset))/255.0f;
419-
int local_idx = round(((offset+(threadIdx.x*q_interval))*(valid_items-1)));
420-
temp_storage.smem_qidx[local_idx] = threadIdx.x;
421-
}
422-
423-
__syncthreads();
424-
425-
for(int i = threadIdx.x; i < BLOCK_ESTIMATE; i+=blockDim.x)
426-
{
427-
if(temp_storage.smem_qidx[i] != -1)
428-
atomicAdd(&code[temp_storage.smem_qidx[i]], vals[i/THREADS_ESTIMATE]);
429-
}
430-
}
431-
}
432-
433-
434349
__launch_bounds__(TH, 4)
435350
__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n)
436351
{
@@ -2984,9 +2899,6 @@ template __global__ void kdequant_mm_int32_fp16<4, 512>(int *__restrict__ const
29842899
template __device__ unsigned char dQuantize<0>(float* smem_code, const float rand, float x);
29852900
template __device__ unsigned char dQuantize<1>(float* smem_code, const float rand, float x);
29862901

2987-
template __global__ void kEstimateQuantiles(float *__restrict__ const A, float *code, const float offset, const float max_val, const int n);
2988-
template __global__ void kEstimateQuantiles(half *__restrict__ const A, float *code, const float offset, const half max_val, const int n);
2989-
29902902
#define MAKE_PreconditionOptimizer32bit1State(oname, gtype) \
29912903
template __global__ void kPreconditionOptimizer32bit1State<gtype, oname, 4096, 8>(gtype* g, gtype* p, \
29922904
float* state1, float *unorm, \

csrc/kernels_hip.cuh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,6 @@
1111
#ifndef kernels
1212
#define kernels
1313

14-
template <typename T>
15-
__global__ void
16-
kEstimateQuantiles(T* __restrict__ const A, float* code, const float offset, const T max_val, const int n);
17-
1814
__global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n);
1915
__global__ void kDequantize(float* code, unsigned char* A, float* out, const int n);
2016

@@ -103,9 +99,6 @@ __global__ void kOptimizerStatic8bit1StateBlockwise(
10399
template <typename T, int BLOCK_SIZE, int NUM_VALS>
104100
__global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n);
105101

106-
__global__ void
107-
kHistogramScatterAdd2D(float* histogram, int* index1, int* index2, float* src, const int maxidx1, const int n);
108-
109102
template <typename T, int SPMM_ITEMS, int BITS>
110103
__global__ void kspmm_coo_very_sparse_naive(
111104
int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,

csrc/ops.hip

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,24 +24,6 @@ using namespace BinSearch;
2424
using std::cout;
2525
using std::endl;
2626

27-
void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
28-
{
29-
int threads = 512;
30-
int num_blocks = n/threads;
31-
num_blocks = n % threads == 0 ? num_blocks : num_blocks + 1;
32-
hipLaunchKernelGGL(( kHistogramScatterAdd2D), dim3(num_blocks), dim3(512), 0, 0, histogram, index1, index2, src, maxidx1, n);
33-
CUDA_CHECK_RETURN(hipPeekAtLastError());
34-
}
35-
36-
template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
37-
{
38-
int num_blocks = n/4096;
39-
num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;
40-
CUDA_CHECK_RETURN(hipMemset(code, 0, 256*sizeof(float)));
41-
hipLaunchKernelGGL(( kEstimateQuantiles<T>), dim3(num_blocks), dim3(512), 0, 0, A, code, offset, std::numeric_limits<T>::max(), n);
42-
CUDA_CHECK_RETURN(hipPeekAtLastError());
43-
}
44-
4527
void quantize(float *code, float *A, unsigned char *out, int n)
4628
{
4729
int num_blocks = n/1024;
@@ -752,9 +734,6 @@ template int igemmlt<32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, con
752734
template int igemmlt<8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
753735
template int igemmlt<8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
754736

755-
template void estimateQuantiles(half *A, float *code, float offset, int n);
756-
template void estimateQuantiles(float *A, float *code, float offset, int n);
757-
758737
template void quantizeBlockwise<half, 1, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
759738
template void quantizeBlockwise<half, 0, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
760739
template void quantizeBlockwise<half, 0, FP4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);

csrc/ops_hip.cuh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,6 @@ class ContextHipsparse {
124124
}
125125
};
126126

127-
template <typename T> void estimateQuantiles(T* A, float* code, float offset, int n);
128-
129127
void quantize(float* code, float* A, unsigned char* out, int n);
130128
void dequantize(float* code, unsigned char* A, float* out, int n, hipStream_t stream);
131129
template <typename T, int STOCHASTIC, int DATA_TYPE>
@@ -160,8 +158,6 @@ void optimizerStatic8bitBlockwise(
160158

161159
template <typename T> void percentileClipping(T* g, float* gnorm_vec, int step, const int n);
162160

163-
void histogramScatterAdd2D(float* histogram, int* index1, int* index2, float* src, int maxidx1, int n);
164-
165161
void gemmex(
166162
Context* context, bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda,
167163
int ldb, int ldc

0 commit comments

Comments
 (0)