From c95fe923f6a8a8877abade8c825fb31938e669ab Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Fri, 5 Sep 2025 07:26:59 -0700
Subject: [PATCH 01/10] Add GET_ROWS for repacked Q2_K (debugging accuracy mismatches)

---
 ggml/src/ggml-cpu/repack.cpp | 172 ++++++++++++++++++++++++++++++++++-
 ggml/src/ggml-quants.c       |  16 +++-
 src/whisper.cpp              |  11 ++-
 3 files changed, 189 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index f531d21e232..d53d206ecd1 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -21,7 +21,7 @@
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Woverlength-strings"
 #endif
-
+#include <stdio.h>
 #define UNUSED GGML_UNUSED
 
 static inline int nearest_int(float fval) {
@@ -1576,6 +1576,11 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
 
                     size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
 
+                    return true;
+                }
+            case GGML_OP_GET_ROWS:
+                {
+                    size = 0;  // GET_ROWS (standard and repacked) doesn't need a work buffer
                     return true;
                 }
             default:
@@ -1593,6 +1598,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             case GGML_OP_MUL_MAT_ID:
                 forward_mul_mat_id(params, op);
                 return true;
+            case GGML_OP_GET_ROWS:
+                forward_get_rows(params, op);
+                return true;
             default:
                 // GGML_ABORT("fatal error");
                 break;
@@ -1801,6 +1809,155 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
 #undef MMID_MATRIX_ROW
     }
 
+    void forward_get_rows(const ggml_compute_params * params,
+                          ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];  // source tensor; its type selects the kernel below
+
+        switch (src0->type) {
+            case GGML_TYPE_Q2_K:  // the only type with a GET_ROWS kernel in this dispatcher
+                ggml_compute_forward_get_rows_q2_Kx8(params, dst);
+                break;
+            default:  // every other source type aborts
+                GGML_ABORT("fatal error");
+                break;
+        }
+    }
+
+    static void ggml_compute_forward_get_rows_q2_Kx8(  // GET_ROWS kernel: dequantizes the src0 rows selected by src1 into dst as float
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];  // quantized rows, addressed below as groups of block_q2_Kx8
+        const ggml_tensor * src1 = dst->src[1];  // row indices, read below as int32
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+        const int64_t nc = ne00;                  // elements per row
+        const int64_t nr = ggml_nelements(src1);  // number of rows to fetch (one per index in src1)
+
+        assert(ne0 == nc);
+        assert(ne02 == ne11);
+        assert(nb00 == ggml_type_size(src0->type));
+        assert(ggml_nrows(dst) == nr);
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        // rows per thread, rounded up so that the nth threads together cover all nr rows
+        const int dr = (nr + nth - 1) / nth;
+
+        // half-open row range [ir0, ir1) handled by this thread
+        const int ir0 = dr * ith;
+        const int ir1 = MIN(ir0 + dr, nr);
+
+        constexpr int nrows_interleaved = 8;
+        const size_t sizeof_one_repacked_block = sizeof(block_q2_Kx8);
+
+        const int num_repacked_blocks_per_row_width = nc / QK_K;
+
+        const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block;  // bytes from one 8-row group to the next
+
+        for (int64_t i = ir0; i < ir1; ++i) {  // i is a flat index over src1; it is split into (i10, i11, i12) below
+            const int64_t i12 = i / (ne11 * ne10);
+            const int64_t i11 = (i - i12 * ne11 * ne10) / ne10;
+            const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10);
+            const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12);  // requested logical row of src0, read as int32 from src1
+
+            GGML_ASSERT(i01 >= 0 && i01 < ne01);
+
+            const int row_group_idx = i01 / nrows_interleaved;     // which 8-row group holds the requested row
+            const int row_idx_in_group = i01 % nrows_interleaved;  // position of the row inside that group
+
+            const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03;
+
+            // First block_q2_Kx8 of group row_group_idx; the group's remaining blocks follow it contiguously
+            const block_q2_Kx8 * p_first_repacked_block_of_group_x8 = (const block_q2_Kx8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups);
+
+            dequantize_row_q2_Kx8(
+                p_first_repacked_block_of_group_x8,
+                (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group);  // writes nc floats into the dst row for (i10, i11, i12)
+        }
+    }
+
+    /**
+     * Dequantizes a single logical row from the repacked q2_Kx8 data format.
+     *
+     * @param p_repacked_blocks Pointer to the first 'block_q2_Kx8' of the 8-row interleaved group that contains the row.
+     * @param y                 Output buffer for the dequantized float values.
+     * @param k                 Total number of elements (columns) in the logical row.
+     * @param row_idx_in_group  The index (0-7) of the logical row to extract from the interleaved data.
+     */
+
+    static void dequantize_row_q2_Kx8(
+        const void * GGML_RESTRICT p_repacked_blocks,
+        float * GGML_RESTRICT y,
+        int64_t k,
+        int row_idx_in_group) {
+        assert(k % QK_K == 0);
+        assert(row_idx_in_group >= 0 && row_idx_in_group < 8);
+
+        const int nb = k / QK_K;
+        const block_q2_Kx8 * blocks = (const block_q2_Kx8 *)p_repacked_blocks;
+        int out_pos = 0;
+        fprintf(stderr, "\n Inside deq");
+        for (int i = 0; i < nb; i++) {
+            const block_q2_Kx8 * current_block = &blocks[i];
+
+            const float d_super_block = GGML_FP16_TO_FP32(current_block->d[row_idx_in_group]);
+            const float dmin_super_block = GGML_FP16_TO_FP32(current_block->dmin[row_idx_in_group]);
+
+            const uint8_t * ptr_qs_base = current_block->qs;
+
+            uint8_t * ptr_repacked_scales = (uint8_t *)current_block->scales; // 16*8 scales repacked - 2bytes of each super block stored together
+            float dl, ml;
+            int is = 0;
+            fprintf(stderr, "[Q2Kx8] blk=%d row=%d d=%g dmin=%g\n", i, row_idx_in_group, d_super_block, dmin_super_block);
+            for (int n = 0; n < QK_K; n += 128) {
+                int shift = 0;
+                for (int j = 0; j < 4; ++j) {
+
+                    // get the scales needed for the 32 values to be dequantized
+                    const int8_t sc0 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
+                    dl = d_super_block * (sc0 & 0xF); 
+                    ml = dmin_super_block * (sc0 >> 4);
+
+                    for (int l = 0; l < 16; ++l) {
+                        float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml;
+                        *y++ = v;
+                        fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
+                    }
+
+                    const int8_t sc1 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
+                    dl = d_super_block * (sc1 & 0xF); 
+                    ml = dmin_super_block * (sc1 >> 4);
+
+                    for (int l = 0; l < 16; ++l) {
+                        float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml;
+                        *y++ = v;
+                        fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
+                    }
+
+                    shift +=2;
+                }
+                // ptr_qs_base += 32*8;
+                // ptr_repacked_scales = (uint8_t *)current_block->scales + 64; 
+            }
+        }
+    }
+
+    static inline uint8_t read_scale_from_repacked(const uint8_t* ptr_repacked_scales, int row_idx_in_group, int scale_idx) {  // returns scale byte scale_idx of one row from the interleaved scales array
+        const int pair_group_idx = scale_idx / 2;   // scales are addressed in pairs: index of the pair
+        const int sub_idx_in_pair = scale_idx % 2;  // 0 or 1: position inside the pair
+        const int offset = pair_group_idx * 16 + row_idx_in_group * 2 + sub_idx_in_pair;  // each pair index spans 16 bytes, 2 per row (presumably 8 rows x 2 bytes)
+        return ptr_repacked_scales[offset];
+    }
+
+    static inline uint8_t read_q_from_repacked(const uint8_t* ptr_q_base, int row_idx_in_group, int q_idx) {  // returns quant byte q_idx of one row from the interleaved qs array
+        const int block_size_interleave  = 8;  // consecutive bytes that belong to one row before the next row's bytes start
+        const int chunk_idx = q_idx / block_size_interleave;        // which 8-byte chunk of the row
+        const int offset_in_chunk = q_idx % block_size_interleave;  // byte position inside that chunk
+        const int offset = chunk_idx * (8 * block_size_interleave) + row_idx_in_group * block_size_interleave + offset_in_chunk;  // each chunk index spans 8 row slots of block_size_interleave bytes
+        return ptr_q_base[offset];
+    }
+
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
@@ -1949,12 +2106,23 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
             //if (op->src[1]->type == GGML_TYPE_Q8_0) {
             //    return true;
             //}
+        } else if (op->op == GGML_OP_GET_ROWS
+        && op->src[0]->buffer
+        && (ggml_n_dims(op->src[0]) == 2)
+        && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
+        && ggml_repack_get_optimal_repack_type(op->src[0])) {
+        if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
+            return false;
+        }
+        if (op->src[0]->type == GGML_TYPE_Q2_K) {
+            return true;
+        }
         }
         return false;
     }
 
     ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
-        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
+        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_GET_ROWS) {
             if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
                 return (ggml::cpu::tensor_traits *) op->src[0]->extra;
             }
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 727932123e4..3eba93a44c2 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -784,7 +784,8 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST
 void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
-
+    int out_pos = 0;
+    fprintf(stderr, "\n Inside deq");
     for (int i = 0; i < nb; i++) {
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
@@ -794,17 +795,26 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
 
         int is = 0;
         float dl, ml;
+        fprintf(stderr, "[Q2Kx8] blk=%d d=%g dmin=%g\n", i, d, min);
         for (int n = 0; n < QK_K; n += 128) {
             int shift = 0;
             for (int j = 0; j < 4; ++j) {
 
                 uint8_t sc = x[i].scales[is++];
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
+                for (int l = 0; l < 16; ++l) {
+                    float v = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
+                    *y++ = v;
+                    fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
+                }
 
                 sc = x[i].scales[is++];
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
+                for (int l = 0; l < 16; ++l) {
+                    float v = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
+                    *y++ = v;
+                    fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
+                }
 
                 shift += 2;
             }
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 52de68c2b12..9cfa969e05d 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1439,7 +1439,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
     } else {
         switch (op) {
             // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT and GGML_OP_GET_ROWS
-            case GGML_OP_GET_ROWS:
+            // case GGML_OP_GET_ROWS:
             case GGML_OP_MUL_MAT: {
                 ggml_init_params params = {
                     /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
@@ -1459,11 +1459,12 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
                     int64_t n_ctx = hparams.n_audio_ctx;
                     ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
                     op_tensor = ggml_mul_mat(ctx, w, b);
-                } else if (op == GGML_OP_GET_ROWS) {
-                    int64_t num_indices = 8;
-                    ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
-                    op_tensor = ggml_get_rows(ctx, w, indices);
                 }
+                //  else if (op == GGML_OP_GET_ROWS) {
+                //     int64_t num_indices = 8;
+                //     ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
+                //     op_tensor = ggml_get_rows(ctx, w, indices);
+                // }
 
                 // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
                 GGML_ASSERT(w->buffer == nullptr);

From c2a4a415d2002b4ea81c578b7d4c8200d69ebf2f Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Mon, 8 Sep 2025 03:20:03 -0700
Subject: [PATCH 02/10] Fix accuracy issues: read repacked Q2_K scales as uint8_t

---
 ggml/src/ggml-cpu/repack.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index d53d206ecd1..54beab594e2 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1915,7 +1915,8 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
                 for (int j = 0; j < 4; ++j) {
 
                     // get the scales needed for the 32 values to be dequantized
-                    const int8_t sc0 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
+                    const uint8_t sc0 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
+                    fprintf(stderr, "scale sc0 =%d ", sc0);
                     dl = d_super_block * (sc0 & 0xF); 
                     ml = dmin_super_block * (sc0 >> 4);
 
@@ -1925,7 +1926,8 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
                         fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                     }
 
-                    const int8_t sc1 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
+                    const uint8_t sc1 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
+                    fprintf(stderr, "scale s10 =%d ", sc1);
                     dl = d_super_block * (sc1 & 0xF); 
                     ml = dmin_super_block * (sc1 >> 4);
 

From 48a69ea5084533bd3ce03ddbcf3bcbb11c35dbbb Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Mon, 8 Sep 2025 03:57:48 -0700
Subject: [PATCH 03/10] q2K fixes

---
 src/whisper.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 9cfa969e05d..52de68c2b12 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -1439,7 +1439,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
     } else {
         switch (op) {
             // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT and GGML_OP_GET_ROWS
-            // case GGML_OP_GET_ROWS:
+            case GGML_OP_GET_ROWS:
             case GGML_OP_MUL_MAT: {
                 ggml_init_params params = {
                     /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
@@ -1459,12 +1459,11 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor *
                     int64_t n_ctx = hparams.n_audio_ctx;
                     ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]);
                     op_tensor = ggml_mul_mat(ctx, w, b);
+                } else if (op == GGML_OP_GET_ROWS) {
+                    int64_t num_indices = 8;
+                    ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
+                    op_tensor = ggml_get_rows(ctx, w, indices);
                 }
-                //  else if (op == GGML_OP_GET_ROWS) {
-                //     int64_t num_indices = 8;
-                //     ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices);
-                //     op_tensor = ggml_get_rows(ctx, w, indices);
-                // }
 
                 // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
                 GGML_ASSERT(w->buffer == nullptr);

From 15506ec0c6c92071811c2557f2fda10d6ee2406f Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Mon, 8 Sep 2025 05:41:44 -0700
Subject: [PATCH 04/10] Comment out the print statements

---
 ggml/src/ggml-cpu/repack.cpp | 12 ++++++------
 ggml/src/ggml-quants.c       | 10 ++++++----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 54beab594e2..2dc1cd3cc46 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1897,7 +1897,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         const int nb = k / QK_K;
         const block_q2_Kx8 * blocks = (const block_q2_Kx8 *)p_repacked_blocks;
         int out_pos = 0;
-        fprintf(stderr, "\n Inside deq");
+        // fprintf(stderr, "\n Inside deq");
         for (int i = 0; i < nb; i++) {
             const block_q2_Kx8 * current_block = &blocks[i];
 
@@ -1909,32 +1909,32 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
             uint8_t * ptr_repacked_scales = (uint8_t *)current_block->scales; // 16*8 scales repacked - 2bytes of each super block stored together
             float dl, ml;
             int is = 0;
-            fprintf(stderr, "[Q2Kx8] blk=%d row=%d d=%g dmin=%g\n", i, row_idx_in_group, d_super_block, dmin_super_block);
+            // fprintf(stderr, "[Q2Kx8] blk=%d row=%d d=%g dmin=%g\n", i, row_idx_in_group, d_super_block, dmin_super_block);
             for (int n = 0; n < QK_K; n += 128) {
                 int shift = 0;
                 for (int j = 0; j < 4; ++j) {
 
                     // get the scales needed for the 32 values to be dequantized
                     const uint8_t sc0 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
-                    fprintf(stderr, "scale sc0 =%d ", sc0);
+                    // fprintf(stderr, "scale sc0 =%d ", sc0);
                     dl = d_super_block * (sc0 & 0xF); 
                     ml = dmin_super_block * (sc0 >> 4);
 
                     for (int l = 0; l < 16; ++l) {
                         float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml;
                         *y++ = v;
-                        fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
+                        // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                     }
 
                     const uint8_t sc1 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
-                    fprintf(stderr, "scale s10 =%d ", sc1);
+                    // fprintf(stderr, "scale s10 =%d ", sc1);
                     dl = d_super_block * (sc1 & 0xF); 
                     ml = dmin_super_block * (sc1 >> 4);
 
                     for (int l = 0; l < 16; ++l) {
                         float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml;
                         *y++ = v;
-                        fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
+                        // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                     }
 
                     shift +=2;
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 3eba93a44c2..865870adc3e 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -785,7 +785,7 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
     int out_pos = 0;
-    fprintf(stderr, "\n Inside deq");
+    // fprintf(stderr, "\n Inside deq");
     for (int i = 0; i < nb; i++) {
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
@@ -795,25 +795,27 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
 
         int is = 0;
         float dl, ml;
-        fprintf(stderr, "[Q2Kx8] blk=%d d=%g dmin=%g\n", i, d, min);
+        // fprintf(stderr, "[Q2Kx8] blk=%d d=%g dmin=%g\n", i, d, min);
         for (int n = 0; n < QK_K; n += 128) {
             int shift = 0;
             for (int j = 0; j < 4; ++j) {
 
                 uint8_t sc = x[i].scales[is++];
+                // fprintf(stderr, "scale sc0 =%d ", sc);
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
                 for (int l = 0; l < 16; ++l) {
                     float v = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
                     *y++ = v;
-                    fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
+                    // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                 }
 
                 sc = x[i].scales[is++];
+                // fprintf(stderr, "scale sc1 =%d ", sc);
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
                 for (int l = 0; l < 16; ++l) {
                     float v = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
                     *y++ = v;
-                    fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
+                    // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                 }
 
                 shift += 2;

From 6baca0df7349e22caf7c567ec11390eaca049f0a Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Mon, 8 Sep 2025 05:44:22 -0700
Subject: [PATCH 05/10] Minor changes

---
 ggml/src/ggml-cpu/repack.cpp | 6 ++----
 ggml/src/ggml-quants.c       | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index 2dc1cd3cc46..dc6d898b7f1 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1921,8 +1921,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
                     ml = dmin_super_block * (sc0 >> 4);
 
                     for (int l = 0; l < 16; ++l) {
-                        float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml;
-                        *y++ = v;
+                        *y++ = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml;
                         // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                     }
 
@@ -1932,8 +1931,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
                     ml = dmin_super_block * (sc1 >> 4);
 
                     for (int l = 0; l < 16; ++l) {
-                        float v = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml;
-                        *y++ = v;
+                        *y++ = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml;
                         // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                     }
 
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 865870adc3e..b8fce389918 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -804,8 +804,7 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
                 // fprintf(stderr, "scale sc0 =%d ", sc);
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
                 for (int l = 0; l < 16; ++l) {
-                    float v = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
-                    *y++ = v;
+                    *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
                     // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                 }
 
@@ -813,8 +812,7 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
                 // fprintf(stderr, "scale sc1 =%d ", sc);
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
                 for (int l = 0; l < 16; ++l) {
-                    float v = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
-                    *y++ = v;
+                    *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
                     // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                 }
 

From 16974edcadaa8ffb5532b4a431ae67259a433bfb Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Mon, 8 Sep 2025 23:22:06 -0700
Subject: [PATCH 06/10] Clean up the code

---
 ggml/src/ggml-cpu/repack.cpp | 12 +-----------
 ggml/src/ggml-quants.c       | 13 ++-----------
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index dc6d898b7f1..a005d13d5e6 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1897,7 +1897,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
         const int nb = k / QK_K;
         const block_q2_Kx8 * blocks = (const block_q2_Kx8 *)p_repacked_blocks;
         int out_pos = 0;
-        // fprintf(stderr, "\n Inside deq");
         for (int i = 0; i < nb; i++) {
             const block_q2_Kx8 * current_block = &blocks[i];
 
@@ -1906,39 +1905,30 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
 
             const uint8_t * ptr_qs_base = current_block->qs;
 
-            uint8_t * ptr_repacked_scales = (uint8_t *)current_block->scales; // 16*8 scales repacked - 2bytes of each super block stored together
+            uint8_t * ptr_repacked_scales = (uint8_t *)current_block->scales;
             float dl, ml;
             int is = 0;
-            // fprintf(stderr, "[Q2Kx8] blk=%d row=%d d=%g dmin=%g\n", i, row_idx_in_group, d_super_block, dmin_super_block);
             for (int n = 0; n < QK_K; n += 128) {
                 int shift = 0;
                 for (int j = 0; j < 4; ++j) {
-
-                    // get the scales needed for the 32 values to be dequantized
                     const uint8_t sc0 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
-                    // fprintf(stderr, "scale sc0 =%d ", sc0);
                     dl = d_super_block * (sc0 & 0xF); 
                     ml = dmin_super_block * (sc0 >> 4);
 
                     for (int l = 0; l < 16; ++l) {
                         *y++ = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l) >> shift) & 3)) - ml;
-                        // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                     }
 
                     const uint8_t sc1 = read_scale_from_repacked(ptr_repacked_scales, row_idx_in_group, is++);
-                    // fprintf(stderr, "scale s10 =%d ", sc1);
                     dl = d_super_block * (sc1 & 0xF); 
                     ml = dmin_super_block * (sc1 >> 4);
 
                     for (int l = 0; l < 16; ++l) {
                         *y++ = dl * ((int8_t)((read_q_from_repacked(ptr_qs_base, row_idx_in_group, n/4 + l + 16) >> shift) & 3)) - ml;
-                        // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
                     }
 
                     shift +=2;
                 }
-                // ptr_qs_base += 32*8;
-                // ptr_repacked_scales = (uint8_t *)current_block->scales + 64; 
             }
         }
     }
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index b8fce389918..91b8b95ba09 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -785,7 +785,6 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
     int out_pos = 0;
-    // fprintf(stderr, "\n Inside deq");
     for (int i = 0; i < nb; i++) {
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
@@ -795,27 +794,19 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
 
         int is = 0;
         float dl, ml;
-        // fprintf(stderr, "[Q2Kx8] blk=%d d=%g dmin=%g\n", i, d, min);
         for (int n = 0; n < QK_K; n += 128) {
             int shift = 0;
             for (int j = 0; j < 4; ++j) {
 
                 uint8_t sc = x[i].scales[is++];
-                // fprintf(stderr, "scale sc0 =%d ", sc);
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l) {
+                for (int l = 0; l < 16; ++l)
                     *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
-                    // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
-                }
 
                 sc = x[i].scales[is++];
-                // fprintf(stderr, "scale sc1 =%d ", sc);
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l) {
+                for (int l = 0; l < 16; ++l)
                     *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
-                    // fprintf(stderr, "y[%d] = %.8f\n", out_pos++, v);
-                }
-
                 shift += 2;
             }
             q += 32;

From 2cb81b31413d4314dd7fa6c88fe173cf7832b11c Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Tue, 9 Sep 2025 00:32:29 -0700
Subject: [PATCH 07/10] Clean up code

---
 ggml/src/ggml-quants.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 91b8b95ba09..dc260979c5b 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -784,7 +784,7 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST
 void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
-    int out_pos = 0;
+
     for (int i = 0; i < nb; i++) {
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
@@ -800,13 +800,11 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
 
                 uint8_t sc = x[i].scales[is++];
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l)
-                    *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
+                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
 
                 sc = x[i].scales[is++];
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l)
-                    *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
+                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
                 shift += 2;
             }
             q += 32;

From 74c193a18c6bd3f38afc00af82bf4b5cb6be7633 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Tue, 9 Sep 2025 00:32:29 -0700
Subject: [PATCH 08/10] Clean up code

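---
Note (review): this patch has the same date, subject, index line and hunks
as PATCH 07/10; it looks like a duplicate and will probably not apply once
07/10 is in. Confirm whether it should be dropped from the series.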
 ggml/src/ggml-quants.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 91b8b95ba09..dc260979c5b 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -784,7 +784,7 @@ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_REST
 void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
-    int out_pos = 0;
+
     for (int i = 0; i < nb; i++) {
 
         const float d = GGML_FP16_TO_FP32(x[i].d);
@@ -800,13 +800,11 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
 
                 uint8_t sc = x[i].scales[is++];
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l)
-                    *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
+                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml;
 
                 sc = x[i].scales[is++];
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
-                for (int l = 0; l < 16; ++l)
-                    *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
+                for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
                 shift += 2;
             }
             q += 32;

From 0d5eec91a29309682a45b354713fa6164f1ff0fd Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Tue, 9 Sep 2025 00:34:10 -0700
Subject: [PATCH 09/10] Code clean up

---
 ggml/src/ggml-quants.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index dc260979c5b..727932123e4 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -805,6 +805,7 @@ void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRI
                 sc = x[i].scales[is++];
                 dl = d * (sc & 0xF); ml = min * (sc >> 4);
                 for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml;
+
                 shift += 2;
             }
             q += 32;

From 75676bda6ace4e58bc43c62f2d76c914e6beff21 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Tue, 9 Sep 2025 01:59:02 -0700
Subject: [PATCH 10/10] Remove unnecessary headers

---
 ggml/src/ggml-cpu/repack.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index a005d13d5e6..0819119f057 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -21,7 +21,6 @@
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Woverlength-strings"
 #endif
-#include <stdio.h>
 #define UNUSED GGML_UNUSED
 
 static inline int nearest_int(float fval) {