Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 38 additions & 38 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,45 +386,44 @@ extern "C" {

// NOTE: always add types at the end of the enum to keep backward compatibility
enum ggml_type {
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
// GGML_TYPE_Q4_2 = 4, support has been removed
// GGML_TYPE_Q4_3 = 5, support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
GGML_TYPE_Q2_K = 10,
GGML_TYPE_Q3_K = 11,
GGML_TYPE_Q4_K = 12,
GGML_TYPE_Q5_K = 13,
GGML_TYPE_Q6_K = 14,
GGML_TYPE_Q8_K = 15,
GGML_TYPE_IQ2_XXS = 16,
GGML_TYPE_IQ2_XS = 17,
GGML_TYPE_IQ3_XXS = 18,
GGML_TYPE_IQ1_S = 19,
GGML_TYPE_IQ4_NL = 20,
GGML_TYPE_IQ3_S = 21,
GGML_TYPE_IQ2_S = 22,
GGML_TYPE_IQ4_XS = 23,
GGML_TYPE_I8 = 24,
GGML_TYPE_I16 = 25,
GGML_TYPE_I32 = 26,
GGML_TYPE_I64 = 27,
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_MXFP4 = 39, // so we are compatible with mainline
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
// GGML_TYPE_Q4_2 = 4, support has been removed
// GGML_TYPE_Q4_3 = 5, support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
GGML_TYPE_Q2_K = 10,
GGML_TYPE_Q3_K = 11,
GGML_TYPE_Q4_K = 12,
GGML_TYPE_Q5_K = 13,
GGML_TYPE_Q6_K = 14,
GGML_TYPE_Q8_K = 15,
GGML_TYPE_IQ2_XXS = 16,
GGML_TYPE_IQ2_XS = 17,
GGML_TYPE_IQ3_XXS = 18,
GGML_TYPE_IQ1_S = 19,
GGML_TYPE_IQ4_NL = 20,
GGML_TYPE_IQ3_S = 21,
GGML_TYPE_IQ2_S = 22,
GGML_TYPE_IQ4_XS = 23,
GGML_TYPE_I8 = 24,
GGML_TYPE_I16 = 25,
GGML_TYPE_I32 = 26,
GGML_TYPE_I64 = 27,
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_I2_S = 36, // So we are able to consume MS BitNet I2_S quants
GGML_TYPE_MXFP4 = 39, // so we are compatible with mainline
GGML_TYPE_Q1_0_G128 = 41, // Bonsai 1-bit quants
//
// So we are able to consume MS BitNet I2_S quants
//
GGML_TYPE_I2_S = 36,
//
GGML_TYPE_Q8_0_X4 = 97,
GGML_TYPE_Q8_1_X4 = 98,
Expand Down Expand Up @@ -530,6 +529,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 28, // except 1d tensors
GGML_FTYPE_MOSTLY_Q1_0_128 = 29, // except 1d tensors
//
GGML_FTYPE_MOSTLY_Q6_0 = 127, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_BN = 128, // except 1d tensors
Expand Down
10 changes: 10 additions & 0 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,16 @@ typedef struct {
} block_iq1_m_r4;
static_assert(sizeof(block_iq1_m_r4) == 28, "wrong iq1_m_r4 block size/padding");

//
// Bonsai
//
#define QK1_0_G128 128   // number of weights per q1_0_g128 block (group size)
// 1-bit "Bonsai" quantization block: one fp16 scale shared by a group of 128
// weights, each weight stored as a single bit in qs (the AVX2 kernel below
// expands each bit to +1 / -1 before applying the scale).
typedef struct {
    ggml_half d;                  // per-group fp16 scale
    uint8_t qs[QK1_0_G128 / 8];   // 128 weight bits, packed 8 per byte
} block_q1_0_g128;
// Layout guard: the type traits report sizeof(block_q1_0_g128) as the
// per-block row stride, so there must be no compiler-inserted padding.
static_assert(sizeof(block_q1_0_g128) == sizeof(ggml_half) + QK1_0_G128 / 8, "wrong q1_0_g128 block size/padding");

//
// Bitnet and TriLM - implemented as 1.625 bpw
//
Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -15435,6 +15435,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
case GGML_TYPE_IQ2_KT: break;
case GGML_TYPE_IQ3_KT: break;
case GGML_TYPE_IQ4_KT: break;
case GGML_TYPE_Q1_0_G128: break;
case GGML_TYPE_IQ3_K: break;
case GGML_TYPE_IQ3_KS: break;
case GGML_TYPE_IQ2_KL: break;
Expand Down
22 changes: 22 additions & 0 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -1677,6 +1677,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.nrows = 1,
.row_meta_size = 4,
},
[GGML_TYPE_Q1_0_G128] = {
.type_name = "q1_0_g128",
.blck_size = QK1_0_G128,
.type_size = sizeof(block_q1_0_g128),
.is_quantized = true,
.to_float = (ggml_to_float_t) dequantize_row_q1_0_g128,
.from_float = quantize_row_q1_0_g128,
.from_float_ref = (ggml_from_float_t)quantize_row_q1_0_g128_ref,
.vec_dot = vec_dot_q1_0_g128_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0_X4,
.nrows = 1,
.row_meta_size = 0,
},
[GGML_TYPE_IQ3_K] = {
.type_name = "iq3_k",
.blck_size = QK_K,
Expand Down Expand Up @@ -4900,6 +4913,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ2_KT: wtype = GGML_TYPE_IQ2_KT; break;
case GGML_FTYPE_MOSTLY_IQ3_KT: wtype = GGML_TYPE_IQ3_KT; break;
case GGML_FTYPE_MOSTLY_IQ4_KT: wtype = GGML_TYPE_IQ4_KT; break;
case GGML_FTYPE_MOSTLY_Q1_0_128: wtype = GGML_TYPE_Q1_0_G128;break;
case GGML_FTYPE_MOSTLY_IQ3_K: wtype = GGML_TYPE_IQ3_K; break;
case GGML_FTYPE_MOSTLY_IQ3_KS: wtype = GGML_TYPE_IQ3_KS; break;
case GGML_FTYPE_MOSTLY_IQ2_KL: wtype = GGML_TYPE_IQ2_KL; break;
Expand Down Expand Up @@ -12817,6 +12831,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_KT:
case GGML_TYPE_IQ4_KT:
case GGML_TYPE_Q1_0_G128:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ3_KS:
case GGML_TYPE_IQ2_KL:
Expand Down Expand Up @@ -13370,6 +13385,7 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_KT:
case GGML_TYPE_IQ4_KT:
case GGML_TYPE_Q1_0_G128:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ3_KS:
case GGML_TYPE_IQ2_KL:
Expand Down Expand Up @@ -13549,6 +13565,7 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_KT:
case GGML_TYPE_IQ4_KT:
case GGML_TYPE_Q1_0_G128:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ3_KS:
case GGML_TYPE_IQ2_KL:
Expand Down Expand Up @@ -17874,6 +17891,7 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_KT:
case GGML_TYPE_IQ4_KT:
case GGML_TYPE_Q1_0_G128:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ3_KS:
case GGML_TYPE_IQ2_KL:
Expand Down Expand Up @@ -18297,6 +18315,7 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_KT:
case GGML_TYPE_IQ4_KT:
case GGML_TYPE_Q1_0_G128:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ3_KS:
case GGML_TYPE_IQ2_KL:
Expand Down Expand Up @@ -18626,6 +18645,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_KT:
case GGML_TYPE_IQ4_KT:
case GGML_TYPE_Q1_0_G128:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ3_KS:
case GGML_TYPE_IQ2_KL:
Expand Down Expand Up @@ -19383,6 +19403,7 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ2_KT:
case GGML_TYPE_IQ3_KT:
case GGML_TYPE_IQ4_KT:
case GGML_TYPE_Q1_0_G128:
case GGML_TYPE_IQ3_K:
case GGML_TYPE_IQ3_KS:
case GGML_TYPE_IQ2_KL:
Expand Down Expand Up @@ -28464,6 +28485,7 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ2_KT: result = quantize_iq2_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_KT: result = quantize_iq3_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_KT: result = quantize_iq4_kt (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q1_0_G128: result = quantize_q1_0_g128(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_K: result = quantize_iq3_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ3_KS: result = quantize_iq3_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ2_KL: result = quantize_iq2_kl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
Expand Down
80 changes: 80 additions & 0 deletions ggml/src/iqk/iqk_gemm_1bit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1445,6 +1445,63 @@ IQK_NOINLINE void mul_mat_iq2bn_q8_K64(int n, const void * vx, size_t bx, const
}
}

// AVX2 kernel: multiply nrc_x rows of 1-bit q1_0_g128 quants (vx, row stride
// bx bytes) by nrc_y rows of q8_0_x4-quantized activations, storing one float
// result per (ix, iy) pair via info.store().
template <int nrc_y>
static void mul_mat_q1_0_g128_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    // Number of QK8_0-sized sub-blocks per QK1_0_G128 block (= 4 when QK8_0 == 32).
    constexpr int n4 = QK1_0_G128 / QK8_0;
    // NOTE(review): q8 is declared with block_q8_0 but never referenced below —
    // the activation rows are read through y[] as block_q8_0_x4 instead.
    // Confirm the Q8 constructor has no required side effects before removing,
    // and whether block_q8_0 vs block_q8_0_x4 here is intentional.
    Q8<nrc_y, block_q8_0> q8(info);
    const block_q8_0_x4 * y[nrc_y];
    for (int iy = 0; iy < nrc_y; ++iy) {
        y[iy] = (const block_q8_0_x4 *)info.src1_row(iy);
    }
    // shuffle[k] replicates source byte 4*k + j into all 8 bytes of 64-bit
    // lane j, so every bit of that byte can be tested in parallel below.
    __m256i shuffle[4] = {
        _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000),
        _mm256_set_epi64x(0x0707070707070707, 0x0606060606060606, 0x0505050505050505, 0x0404040404040404),
        _mm256_set_epi64x(0x0b0b0b0b0b0b0b0b, 0x0a0a0a0a0a0a0a0a, 0x0909090909090909, 0x0808080808080808),
        _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0e0e0e0e0e0e0e0e, 0x0d0d0d0d0d0d0d0d, 0x0c0c0c0c0c0c0c0c),
    };
    // One distinct bit per byte position (0x80,0x40,...,0x01): AND + CMPEQ
    // turns each bit of a replicated byte into a full 0xFF/0x00 byte mask.
    auto mask = _mm256_set1_epi64x(0x8040201008040201);
    auto mp1 = _mm256_set1_epi8( 1);
    auto mm1 = _mm256_set1_epi8(-1);
    auto m1 = _mm256_set1_epi16(1);
    int nb = n / QK1_0_G128;   // 128-weight blocks per row
    __m256i qx[4];
    __m256i sumi[4];
    for (int ix = 0; ix < nrc_x; ++ix) {
        auto x = (const block_q1_0_g128 *)((const char *)vx + ix*bx);
        __m256 acc[nrc_y] = {};
        for (int ib = 0; ib < nb; ++ib) {
            float d = GGML_FP16_TO_FP32(x[ib].d);   // per-block scale
            auto vd = _mm256_set1_ps(d);
            // 16 bytes = 128 weight bits for this block, duplicated into both
            // 128-bit halves so the in-lane byte shuffle can reach all of them.
            auto bits128 = _mm_loadu_si128((const __m128i *)x[ib].qs);
            auto bits = MM256_SET_M128I(bits128, bits128);
            for (int k = 0; k < 4; ++k) {
                // Expand 32 packed bits into 32 bytes of +1 (bit set) / -1 (bit clear).
                qx[k] = _mm256_shuffle_epi8(bits, shuffle[k]);
                qx[k] = _mm256_cmpeq_epi8(_mm256_and_si256(qx[k], mask), mask);
                qx[k] = _mm256_or_si256(_mm256_and_si256(qx[k], mp1), _mm256_andnot_si256(qx[k], mm1));
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                for (int k = 0; k < n4; ++k) {
                    auto qy = _mm256_loadu_si256((const __m256i *)y[iy][ib].qs + k);
                    // Each int32 lane accumulates 4 products of (+/-1) * int8.
#ifdef HAVE_VNNI256
                    sumi[k] = _mm256_dpbusd_epi32(_mm256_setzero_si256(), mp1, _mm256_sign_epi8(qy, qx[k]));
#else
                    sumi[k] = _mm256_madd_epi16(m1, _mm256_maddubs_epi16(mp1, _mm256_sign_epi8(qy, qx[k])));
#endif
                }
                // Reduce the four sub-block partial sums. The saturating packs
                // are safe: each int32 lane holds a sum of at most 4 int8-range
                // values, well inside int16 range at every pack stage.
                sumi[0] = _mm256_madd_epi16(m1, _mm256_packs_epi32(sumi[0], sumi[1]));
                sumi[2] = _mm256_madd_epi16(m1, _mm256_packs_epi32(sumi[2], sumi[3]));
                sumi[0] = _mm256_madd_epi16(m1, _mm256_packs_epi32(sumi[0], sumi[2]));
                // Four fp16 sub-block scales of the q8_0_x4 row, converted and
                // duplicated across both 128-bit halves to match sumi[0]'s layout.
                auto dy = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)y[iy][ib].d));
                auto dxy= _mm256_mul_ps(vd, _mm256_set_m128(dy, dy));
                acc[iy] = _mm256_fmadd_ps(dxy, _mm256_cvtepi32_ps(sumi[0]), acc[iy]);
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            // Horizontal sum of the 8 float accumulator lanes -> final result.
            info.store(ix, iy, hsum_float_8(acc[iy]));
        }
    }
}

template <int nrc_y>
static void mul_mat_iq2_bn_r4_q8_k16_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
if (nrc_x%4) {
Expand Down Expand Up @@ -1903,6 +1960,11 @@ bool iqk_set_kernels_1bit(int ne00, int typeA, int typeB, std::array<mul_mat_t,
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_iq2_bn_r4_q8_k16, funcs);
expected_typeB = GGML_TYPE_Q8_K16;
break;
case GGML_TYPE_Q1_0_G128:
if (ne00 % QK1_0_G128 != 0) return false;
expected_typeB = GGML_TYPE_Q8_0_X4;
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q1_0_g128_q8_0, funcs);
break;

default:
return false;
Expand Down Expand Up @@ -2279,6 +2341,19 @@ static void mul_mat_iq2bn_q8_K64(int n, const void * vx, size_t bx, const DataIn
}
}

template <int nrc_y>
static void mul_mat_q1_0_g128_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
Q8<nrc_y, block_q8_0_x4> q8(info);
for (int ix = 0; ix < nrc_x; ++ix) {
auto x = (const block_q1_0_g128 *)((const char *)vx + ix*bx);
for (int iy = 0; iy < nrc_y; ++iy) {
float s;
vec_dot_q1_0_g128_q8_0(n, &s, 0, x, bx, q8.y[iy], 0, 1);
info.store(ix, iy, s);
}
}
}

template <int nrc_y>
static void mul_mat_iq1_s_r4_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
GGML_ASSERT(nrc_x%4 == 0);
Expand Down Expand Up @@ -2831,6 +2906,11 @@ bool iqk_set_kernels_1bit(int ne00, int typeA, int typeB, std::array<mul_mat_t,
func16 = mul_mat_iq1_m_r4_q8_0<16>;
expected_Btype = GGML_TYPE_Q8_K128;
break;
case GGML_TYPE_Q1_0_G128:
if (ne00 % QK1_0_G128 != 0) return false;
expected_Btype = GGML_TYPE_Q8_0_X4;
IQK_SET_MUL_MAT_FUNCTIONS(mul_mat_q1_0_g128_q8_0, funcs);
break;
default:
return false;
}
Expand Down
2 changes: 2 additions & 0 deletions ggml/src/iqk/iqk_mul_mat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -941,6 +941,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& mm, int Ny) {
case GGML_TYPE_IQ1_BN:
case GGML_TYPE_IQ2_BN:
case GGML_TYPE_IQ2_BN_R4:
case GGML_TYPE_Q1_0_G128:
return iqk_set_kernels_1bit(ne00, typeA, typeB, mm.funcs, mm.func16);

default:
Expand Down Expand Up @@ -1032,6 +1033,7 @@ bool MulMat::prepare(int typeA, int typeB, int ne00, MulMat& m, int /*Ny*/) {
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_IQ1_M_R4:
case GGML_TYPE_Q1_0_G128:
return iqk_set_kernels_1bit(ne00, typeA, typeB, m.funcs, m.func16);
case GGML_TYPE_IQ1_KT:
case GGML_TYPE_IQ2_KT:
Expand Down
Loading