From c95fe923f6a8a8877abade8c825fb31938e669ab Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Fri, 5 Sep 2025 07:26:59 -0700 Subject: [PATCH 01/10] q2K - accuracy mismatches --- ggml/src/ggml-cpu/repack.cpp | 172 ++++++++++++++++++++++++++++++++++- ggml/src/ggml-quants.c | 16 +++- src/whisper.cpp | 11 ++- 3 files changed, 189 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index f531d21e232..d53d206ecd1 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -21,7 +21,7 @@ #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" #endif - +#include #define UNUSED GGML_UNUSED static inline int nearest_int(float fval) { @@ -1576,6 +1576,11 @@ template src[0]; + + switch (src0->type) { + case GGML_TYPE_Q2_K: + ggml_compute_forward_get_rows_q2_Kx8(params, dst); + break; + default: + GGML_ABORT("fatal error"); + break; + } + } + + static void ggml_compute_forward_get_rows_q2_Kx8( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == ggml_type_size(src0->type)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1) / nth; + + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + constexpr int nrows_interleaved = 8; + const size_t sizeof_one_repacked_block = sizeof(block_q2_Kx8); + + const int num_repacked_blocks_per_row_width = nc / QK_K; + + const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block; + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i / (ne11 * ne10); + const int64_t i11 = (i - i12 * ne11 * ne10) / ne10; + const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10); + const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12); // original logical row + + GGML_ASSERT(i01 >= 0 && i01 < ne01); + + const int row_group_idx = i01 / nrows_interleaved; + const int row_idx_in_group = i01 % nrows_interleaved; + + const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03; + + // Pointer to the first block_q2_Kx8 of the identified row_group_idx + const block_q2_Kx8 * p_first_repacked_block_of_group_x8 = (const block_q2_Kx8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups); + + dequantize_row_q2_Kx8( + p_first_repacked_block_of_group_x8, + (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group); + } + } + + /** + * Dequantizes a single logical row from the repacked q2_Kx8 data format. + * + * @param p_repacked_blocks Pointer to the start of the 'block_q_Kx8' structures for the entire row. + * @param y Output buffer for the dequantized float values. + * @param k Total number of elements (columns) in the logical row. + * @param row_idx_in_group The index (0-7) of the logical row to extract from the interleaved data. + */ + + static void dequantize_row_q2_Kx8( + const void * GGML_RESTRICT p_repacked_blocks, + float * GGML_RESTRICT y, + int64_t k, + int row_idx_in_group) { + assert(k % QK_K == 0); + assert(row_idx_in_group >= 0 && row_idx_in_group < 8); + + const int nb = k / QK_K; + const block_q2_Kx8 * blocks = (const block_q2_Kx8 *)p_repacked_blocks; + int out_pos = 0; + fprintf(stderr, "\n Inside deq"); + for (int i = 0; i < nb; i++) { + const block_q2_Kx8 * current_block = &blocks[i]; + + const float d_super_block = GGML_FP16_TO_FP32(current_block->d[row_idx_in_group]); + const float dmin_super_block = GGML_FP16_TO_FP32(current_block->dmin[row_idx_in_group]); + + const uint8_t * ptr_qs_base = current_block->qs; + + uint8_t * ptr_repacked_scales = (uint8_t *)current_block->scales; // 16*8 scales repacked - 2bytes of each super block stored together + float dl, ml; + int is = 0; + fprintf(stderr, "[Q2Kx8] blk=%d row=%d d=%g dmin=%g\n", i, row_idx_in_group, d_super_block, dmin_super_block); + for (int n = 0; n < QK_K; n += 128) { + int shift = 0; + for (int j = 0; j < 4; ++j) { + + // get the scales needed for the 32 values to be dequantized + const int8_t sc0 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++); + dl = d_super_block * (sc0 & 0xF); + ml = dmin_super_block * (sc0 >> 4); + + for (int l = 0; l < 16; ++l) { + float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml; + *y++ = v; + fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); + } + + const int8_t sc1 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++); + dl = d_super_block * (sc1 & 0xF); + ml = dmin_super_block * (sc1 >> 4); + + for (int l = 0; l < 16; ++l) { + float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml; + *y++ = v; + fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); + } + + shift +=2; + } + // ptr_qs_base += 32*8; + // ptr_repacked_scales = (uint8_t *)current_block->scales + 64; + } + } + } + + static inline uint8_t read_scale_from_repacked(const uint8_t* ptr_repacked_scales, int row_idx_in_group, int scale_idx) { + const int pair_group_idx = scale_idx / 2; + const int sub_idx_in_pair = scale_idx % 2; + const int offset = pair_group_idx * 16 + row_idx_in_group * 2 + sub_idx_in_pair; + return ptr_repacked_scales[offset]; + } + + static inline uint8_t read_q_from_repacked(const uint8_t* ptr_q_base, int row_idx_in_group, int q_idx) { + const int block_size_interleave = 8; + const int chunk_idx = q_idx / block_size_interleave; + const int offset_in_chunk = q_idx % block_size_interleave; + const int offset = chunk_idx * (8 * block_size_interleave) + row_idx_in_group * block_size_interleave + offset_in_chunk; + return ptr_q_base[offset]; + } + int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), (int) NB_COLS, (int) INTER_SIZE); @@ -1949,12 +2106,23 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { //if (op->src[1]->type == GGML_TYPE_Q8_0) { // return true; //} + } else if (op->op == GGML_OP_GET_ROWS + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 2) + && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() + && ggml_repack_get_optimal_repack_type(op->src[0])) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[0]->type == GGML_TYPE_Q2_K) { + return true; + } } return false; } ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_GET_ROWS) { if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) { return (ggml::cpu::tensor_traits *) op->src[0]->extra; } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 727932123e4..3eba93a44c2 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -784,7 +784,8 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; - + int out_pos = 0; + fprintf(stderr, "\n Inside deq"); for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); @@ -794,17 +795,26 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI int is = 0; float dl, ml; + fprintf(stderr, "[Q2Kx8] blk=%d d=%g dmin=%g\n", i, d, min); for (int n = 0; n < QK_K; n += 128) { int shift = 0; for (int j = 0; j < 4; ++j) { uint8_t sc = x[i].scales[is++]; dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; + for (int l = 0; l < 16; ++l) { + float v = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; + *y++ = v; + fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); + } sc = x[i].scales[is++]; dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; + for (int l = 0; l < 16; ++l) { + float v = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; + *y++ = v; + fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); + } shift += 2; } diff --git a/src/whisper.cpp b/src/whisper.cpp index 52de68c2b12..9cfa969e05d 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1439,7 +1439,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * } else { switch (op) { // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT and GGML_OP_GET_ROWS - case GGML_OP_GET_ROWS: + // case GGML_OP_GET_ROWS: case GGML_OP_MUL_MAT: { ggml_init_params params = { /*.mem_size =*/ 2 * ggml_tensor_overhead(), @@ -1459,11 +1459,12 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * int64_t n_ctx = hparams.n_audio_ctx; ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]); op_tensor = ggml_mul_mat(ctx, w, b); - } else if (op == GGML_OP_GET_ROWS) { - int64_t num_indices = 8; - ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices); - op_tensor = ggml_get_rows(ctx, w, indices); } + // else if (op == GGML_OP_GET_ROWS) { + // int64_t num_indices = 8; + // ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices); + // op_tensor = ggml_get_rows(ctx, w, indices); + // } // create a temporary dummy buffer for the weight so that supports_op can check the buffer type GGML_ASSERT(w->buffer == nullptr); From c2a4a415d2002b4ea81c578b7d4c8200d69ebf2f Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Mon, 8 Sep 2025 03:20:03 -0700 Subject: [PATCH 02/10] Fix accuracy issues --- ggml/src/ggml-cpu/repack.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index d53d206ecd1..54beab594e2 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1915,7 +1915,8 @@ template > 4); @@ -1925,7 +1926,8 @@ template > 4); From 48a69ea5084533bd3ce03ddbcf3bcbb11c35dbbb Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Mon, 8 Sep 2025 03:57:48 -0700 Subject: [PATCH 03/10] q2K fixes --- src/whisper.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 9cfa969e05d..52de68c2b12 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1439,7 +1439,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * } else { switch (op) { // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT and GGML_OP_GET_ROWS - // case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS: case GGML_OP_MUL_MAT: { ggml_init_params params = { /*.mem_size =*/ 2 * ggml_tensor_overhead(), @@ -1459,12 +1459,11 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * int64_t n_ctx = hparams.n_audio_ctx; ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]); op_tensor = ggml_mul_mat(ctx, w, b); + } else if (op == GGML_OP_GET_ROWS) { + int64_t num_indices = 8; + ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices); + op_tensor = ggml_get_rows(ctx, w, indices); } - // else if (op == GGML_OP_GET_ROWS) { - // int64_t num_indices = 8; - // ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices); - // op_tensor = ggml_get_rows(ctx, w, indices); - // } // create a temporary dummy buffer for the weight so that supports_op can check the buffer type GGML_ASSERT(w->buffer == nullptr); From 15506ec0c6c92071811c2557f2fda10d6ee2406f Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Mon, 8 Sep 2025 05:41:44 -0700 Subject: [PATCH 04/10] Comment out the print statements --- ggml/src/ggml-cpu/repack.cpp | 12 ++++++------ ggml/src/ggml-quants.c | 10 ++++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 54beab594e2..2dc1cd3cc46 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1897,7 +1897,7 @@ template scales; // 16*8 scales repacked - 2bytes of each super block stored together float dl, ml; int is = 0; - fprintf(stderr, "[Q2Kx8] blk=%d row=%d d=%g dmin=%g\n", i, row_idx_in_group, d_super_block, dmin_super_block); + // fprintf(stderr, "[Q2Kx8] blk=%d row=%d d=%g dmin=%g\n", i, row_idx_in_group, d_super_block, dmin_super_block); for (int n = 0; n < QK_K; n += 128) { int shift = 0; for (int j = 0; j < 4; ++j) { // get the scales needed for the 32 values to be dequantized const uint8_t sc0 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++); - fprintf(stderr, "scale sc0 =%d ", sc0); + // fprintf(stderr, "scale sc0 =%d ", sc0); dl = d_super_block * (sc0 & 0xF); ml = dmin_super_block * (sc0 >> 4); for (int l = 0; l < 16; ++l) { float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml; *y++ = v; - fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); + // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } const uint8_t sc1 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++); - fprintf(stderr, "scale s10 =%d ", sc1); + // fprintf(stderr, "scale s10 =%d ", sc1); dl = d_super_block * (sc1 & 0xF); ml = dmin_super_block * (sc1 >> 4); for (int l = 0; l < 16; ++l) { float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml; *y++ = v; - fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); + // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } shift +=2; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 3eba93a44c2..865870adc3e 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -785,7 +785,7 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI assert(k % QK_K == 0); const int nb = k / QK_K; int out_pos = 0; - fprintf(stderr, "\n Inside deq"); + // fprintf(stderr, "\n Inside deq"); for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); @@ -795,25 +795,27 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI int is = 0; float dl, ml; - fprintf(stderr, "[Q2Kx8] blk=%d d=%g dmin=%g\n", i, d, min); + // fprintf(stderr, "[Q2Kx8] blk=%d d=%g dmin=%g\n", i, d, min); for (int n = 0; n < QK_K; n += 128) { int shift = 0; for (int j = 0; j < 4; ++j) { uint8_t sc = x[i].scales[is++]; + // fprintf(stderr, "scale sc0 =%d ", sc); dl = d * (sc & 0xF); ml = min * (sc >> 4); for (int l = 0; l < 16; ++l) { float v = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; *y++ = v; - fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); + // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } sc = x[i].scales[is++]; + // fprintf(stderr, "scale sc1 =%d ", sc); dl = d * (sc & 0xF); ml = min * (sc >> 4); for (int l = 0; l < 16; ++l) { float v = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; *y++ = v; - fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); + // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } shift += 2; From 6baca0df7349e22caf7c567ec11390eaca049f0a Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Mon, 8 Sep 2025 05:44:22 -0700 Subject: [PATCH 05/10] Minor changes --- ggml/src/ggml-cpu/repack.cpp | 6 ++---- ggml/src/ggml-quants.c | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 2dc1cd3cc46..dc6d898b7f1 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1921,8 +1921,7 @@ template > 4); for (int l = 0; l < 16; ++l) { - float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml; - *y++ = v; + *y++ = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml; // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } @@ -1932,8 +1931,7 @@ template > 4); for (int l = 0; l < 16; ++l) { - float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml; - *y++ = v; + *y++ = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml; // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 865870adc3e..b8fce389918 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -804,8 +804,7 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI // fprintf(stderr, "scale sc0 =%d ", sc); dl = d * (sc & 0xF); ml = min * (sc >> 4); for (int l = 0; l < 16; ++l) { - float v = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; - *y++ = v; + *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } @@ -813,8 +812,7 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI // fprintf(stderr, "scale sc1 =%d ", sc); dl = d * (sc & 0xF); ml = min * (sc >> 4); for (int l = 0; l < 16; ++l) { - float v = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; - *y++ = v; + *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } From 16974edcadaa8ffb5532b4a431ae67259a433bfb Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Mon, 8 Sep 2025 23:22:06 -0700 Subject: [PATCH 06/10] Clean up the code --- ggml/src/ggml-cpu/repack.cpp | 12 +----------- ggml/src/ggml-quants.c | 13 ++----------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index dc6d898b7f1..a005d13d5e6 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1897,7 +1897,6 @@ template qs; - uint8_t * ptr_repacked_scales = (uint8_t *)current_block->scales; // 16*8 scales repacked - 2bytes of each super block stored together + uint8_t * ptr_repacked_scales = (uint8_t *)current_block->scales; float dl, ml; int is = 0; - // fprintf(stderr, "[Q2Kx8] blk=%d row=%d d=%g dmin=%g\n", i, row_idx_in_group, d_super_block, dmin_super_block); for (int n = 0; n < QK_K; n += 128) { int shift = 0; for (int j = 0; j < 4; ++j) { - - // get the scales needed for the 32 values to be dequantized const uint8_t sc0 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++); - // fprintf(stderr, "scale sc0 =%d ", sc0); dl = d_super_block * (sc0 & 0xF); ml = dmin_super_block * (sc0 >> 4); for (int l = 0; l < 16; ++l) { *y++ = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml; - // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } const uint8_t sc1 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++); - // fprintf(stderr, "scale s10 =%d ", sc1); dl = d_super_block * (sc1 & 0xF); ml = dmin_super_block * (sc1 >> 4); for (int l = 0; l < 16; ++l) { *y++ = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml; - // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); } shift +=2; } - // ptr_qs_base += 32*8; - // ptr_repacked_scales = (uint8_t *)current_block->scales + 64; } } } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index b8fce389918..91b8b95ba09 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -785,7 +785,6 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI assert(k % QK_K == 0); const int nb = k / QK_K; int out_pos = 0; - // fprintf(stderr, "\n Inside deq"); for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); @@ -795,27 +794,19 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI int is = 0; float dl, ml; - // fprintf(stderr, "[Q2Kx8] blk=%d d=%g dmin=%g\n", i, d, min); for (int n = 0; n < QK_K; n += 128) { int shift = 0; for (int j = 0; j < 4; ++j) { uint8_t sc = x[i].scales[is++]; - // fprintf(stderr, "scale sc0 =%d ", sc); dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) { + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; - // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); - } sc = x[i].scales[is++]; - // fprintf(stderr, "scale sc1 =%d ", sc); dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) { + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; - // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v); - } - shift += 2; } q += 32; From 2cb81b31413d4314dd7fa6c88fe173cf7832b11c Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Tue, 9 Sep 2025 00:32:29 -0700 Subject: [PATCH 07/10] Clean up code --- ggml/src/ggml-quants.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 91b8b95ba09..dc260979c5b 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -784,7 +784,7 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; - int out_pos = 0; + for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); @@ -800,13 +800,11 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI uint8_t sc = x[i].scales[is++]; dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) - *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; sc = x[i].scales[is++]; dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) - *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; shift += 2; } q += 32; From 74c193a18c6bd3f38afc00af82bf4b5cb6be7633 Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Tue, 9 Sep 2025 00:32:29 -0700 Subject: [PATCH 08/10] Clean up code --- ggml/src/ggml-quants.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 91b8b95ba09..dc260979c5b 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -784,7 +784,7 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; - int out_pos = 0; + for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); @@ -800,13 +800,11 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI uint8_t sc = x[i].scales[is++]; dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) - *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; sc = x[i].scales[is++]; dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) - *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; + for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; shift += 2; } q += 32; From 0d5eec91a29309682a45b354713fa6164f1ff0fd Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Tue, 9 Sep 2025 00:34:10 -0700 Subject: [PATCH 09/10] Code clean up --- ggml/src/ggml-quants.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index dc260979c5b..727932123e4 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -805,6 +805,7 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI sc = x[i].scales[is++]; dl = d * (sc & 0xF); ml = min * (sc >> 4); for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; + shift += 2; } q += 32; From 75676bda6ace4e58bc43c62f2d76c914e6beff21 Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Tue, 9 Sep 2025 01:59:02 -0700 Subject: [PATCH 10/10] Remove unnecessary headers --- ggml/src/ggml-cpu/repack.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index a005d13d5e6..0819119f057 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -21,7 +21,6 @@ #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" #endif -#include #define UNUSED GGML_UNUSED static inline int nearest_int(float fval) {