MXFP4 #682 (Merged)

46 changes: 28 additions & 18 deletions common/chat-parser.cpp
@@ -82,28 +82,38 @@ bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
}

bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
auto start_pos = input_.find(start_think, pos_);
if (start_pos == std::string::npos) {
return false;
}
auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
auto stripped_reasoning = string_strip(reasoning);
if (stripped_reasoning.empty()) {
return;
}
if (syntax_.reasoning_in_content) {
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "<think>" : start_think);
add_content(stripped_reasoning);
if (closed) {
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
}
} else {
add_reasoning_content(stripped_reasoning);
}
};

auto end_pos = input_.find(end_think, start_pos + start_think.size());
if (end_pos == std::string::npos) {
if (is_partial_) {
// Partial reasoning content
auto reasoning = input_.substr(start_pos + start_think.size());
add_reasoning_content(string_strip(reasoning));
pos_ = input_.size();
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
if (auto res = try_find_literal(end_think)) {
handle_reasoning(res->prelude, /* closed */ true);
consume_spaces();
return true;
}
auto rest = consume_rest();
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
// Allow unclosed thinking tags for now (following original llama.cpp)
return true;
}
return false;
}

// Extract reasoning content
auto reasoning = input_.substr(start_pos + start_think.size(), end_pos - start_pos - start_think.size());
add_reasoning_content(string_strip(reasoning));
pos_ = end_pos + end_think.size();
return true;
return false;
}

std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal_legacy(const std::string & literal) {
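The rewritten try_parse_reasoning() now mirrors mainline llama.cpp: it only runs when a reasoning format is selected, consumes the opening tag (or treats it as forced open), and tolerates an unclosed tag. Below is a minimal standalone sketch of that behaviour, using plain std::string instead of the real parser class; the names are illustrative and string_strip-style trimming is omitted.

```cpp
#include <iostream>
#include <optional>
#include <string>

struct reasoning_result {
    std::string reasoning; // what would go to add_reasoning_content() (or back into content)
    std::string content;   // whatever follows the closing tag
    bool        closed;    // was the end tag seen (or the message complete)?
};

// Sketch of the new control flow: the tag is assumed to start at the current position
// (thinking_forced_open, or the opening literal was just consumed), as in the patch above.
static std::optional<reasoning_result> parse_reasoning(const std::string & input,
                                                       bool is_partial,
                                                       const std::string & start_think = "<think>",
                                                       const std::string & end_think   = "</think>") {
    if (input.rfind(start_think, 0) != 0) {
        return std::nullopt;                               // no opening tag: nothing to do
    }
    const size_t body = start_think.size();
    const size_t end  = input.find(end_think, body);
    if (end != std::string::npos) {                        // closed block: reasoning + trailing content
        return reasoning_result{ input.substr(body, end - body),
                                 input.substr(end + end_think.size()), true };
    }
    // unclosed block: keep it as reasoning; it only counts as closed if the message is complete
    return reasoning_result{ input.substr(body), "", !is_partial };
}

int main() {
    auto full    = parse_reasoning("<think>plan the answer</think>The answer is 42.", /*is_partial=*/false);
    auto partial = parse_reasoning("<think>still thin", /*is_partial=*/true);
    std::cout << full->reasoning << " | " << full->content << " | closed=" << full->closed << "\n";
    std::cout << partial->reasoning << " | closed=" << partial->closed << "\n";
}
```
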
3 changes: 3 additions & 0 deletions common/chat.cpp
@@ -278,6 +278,9 @@ void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
throw; // Re-throw for partial mode
}
}

// Add any remaining content (critical for responses without tool calls)
builder.add_content(builder.consume_rest());
}

// Parse DeepSeek R1 tools array format following original llama.cpp parse_prefixed_json_tool_call_array pattern
10 changes: 10 additions & 0 deletions common/chat.h
@@ -135,8 +135,18 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_KIMI_K2, // Our custom format (keep last for backward compatibility)
};

enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_DEEPSEEK,
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY,
};

struct common_chat_syntax {
common_chat_format format = COMMON_CHAT_FORMAT_KIMI_K2;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool enable_thinking = false;
bool enable_tool_calls = true;
};
20 changes: 20 additions & 0 deletions common/common.cpp
@@ -1080,6 +1080,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "--cpu-moe" || arg == "-cmoe") {
params.tensor_buft_overrides.push_back({strdup("\\.ffn_(up|down|gate)_exps"), ggml_backend_cpu_buffer_type()});
return true;
}
if (arg == "--n-cpu-moe" || arg == "-ncmoe") {
CHECK_ARG
int32_t n_layers = std::stoi(argv[i]);
if (n_layers < 0) {
fprintf(stderr, "error: Invalid value for --n-cpu-moe: %d (must be >= 0)\n", n_layers);
invalid_param = true;
return true;
}
for (int32_t l = 0; l < n_layers; ++l) {
std::string pattern = "blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)";
params.tensor_buft_overrides.push_back({strdup(pattern.c_str()), ggml_backend_cpu_buffer_type()});
}
return true;
}
if (arg == "--no-mmap") {
params.use_mmap = false;
return true;
@@ -1794,6 +1812,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
}
options.push_back({ "*", " --run-time-repack", "repack tensors if interleaved variant is available"});
options.push_back({ "*", " --cpu-moe", "keep all MoE weights in CPU memory"});
options.push_back({ "*", " --n-cpu-moe N", "keep MoE weights of the first N layers in CPU memory"});
options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
" - distribute: spread execution evenly over all nodes\n"
" - isolate: only spawn threads on CPUs on the node that execution started on\n"
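For context, both new flags work by appending tensor-buffer-type overrides that route matching expert tensors to the CPU buffer type. A small standalone sketch of the patterns they register (illustrative only, not the real gpt_params plumbing):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Minimal sketch: both flags simply append {regex, CPU buffer type} overrides that the
// model loader later matches against tensor names such as "blk.7.ffn_up_exps.weight".
int main() {
    std::vector<std::string> cpu_overrides;

    // --cpu-moe / -cmoe: one pattern matching the expert FFN tensors of every layer
    cpu_overrides.push_back("\\.ffn_(up|down|gate)_exps");

    // --n-cpu-moe N / -ncmoe N: per-layer patterns for the first N layers only
    const int n_cpu_moe = 3; // hypothetical example value
    for (int l = 0; l < n_cpu_moe; ++l) {
        cpu_overrides.push_back("blk\\." + std::to_string(l) + "\\.(ffn_(up|down|gate)_exps)");
    }

    for (const auto & p : cpu_overrides) {
        std::printf("-> CPU buffer override: %s\n", p.c_str());
    }
    return 0;
}
```

So --cpu-moe pins every layer's ffn_{up,down,gate}_exps tensors to host memory, while --n-cpu-moe N restricts this to layers 0..N-1.
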
1 change: 1 addition & 0 deletions examples/quantize/quantize.cpp
@@ -28,6 +28,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
{ "Q6_0", LLAMA_FTYPE_MOSTLY_Q6_0, " 6.5 bpw quantization", },
{ "MXFP4", LLAMA_FTYPE_MOSTLY_MXFP4, " 4.25 bpw 4-bit float quantization",},
{ "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
{ "IQ2_XXS_R4",LLAMA_FTYPE_MOSTLY_IQ2_XXS_R4,"IQ2_XXS repacked", },
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
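The 4.25 bpw figure follows from the block layout added to ggml-common.h below: each 32-value MXFP4 block stores one E8M0 scale byte plus 16 bytes of packed 4-bit values, i.e. 17 bytes * 8 bits / 32 values = 4.25 bits per weight.
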
2 changes: 2 additions & 0 deletions examples/server/function_calls.hpp
@@ -89,6 +89,8 @@ static ik_chat_msg parse_chat_message_incremental(const std::string& content, bo
try {
common_chat_syntax syntax;
syntax.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
syntax.reasoning_in_content = true; // Fix for thinking tag termination issue
syntax.enable_tool_calls = true;

common_chat_msg_parser parser(content, is_partial, syntax);
8 changes: 5 additions & 3 deletions ggml/include/ggml.h
@@ -403,6 +403,7 @@ extern "C" {
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_MXFP4 = 39, // so we are compatible with mainline
//
// So we are able to consume MS BitNet I2_S quants
//
@@ -507,9 +508,10 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors, using 25 to be compatible with mainline
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 28, // except 1d tensors
//
GGML_FTYPE_MOSTLY_Q6_0 = 127, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_BN = 128, // except 1d tensors
18 changes: 18 additions & 0 deletions ggml/src/ggml-common.h
@@ -158,6 +158,9 @@ typedef sycl::half2 ggml_half2;
#define QI1_BN (QK_IQ1BN / (4*QR1_BN))
#define QR1_BN 8

#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
#define QR_MXFP4 2

#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

#define QK4_0 32
@@ -174,6 +177,15 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");

// This is unfortunate (block is 17 bytes, so not even a 2-byte alignment)
// But to be able to use MXFP4-quantized models from mainline, we do the same.
#define QK_MXFP4 32
typedef struct {
uint8_t e; // E8M0
uint8_t qs[QK_MXFP4/2];
} block_mxfp4;
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");

#define QK5_0 32
typedef struct {
ggml_half d; // delta
@@ -2211,5 +2223,11 @@ GGML_TABLE_BEGIN(int8_t, iq6nl_values, 128)
48, 52, 56, 60, 64, 69, 73, 78, 83, 88, 93, 99, 104, 110, 116, 122,
GGML_TABLE_END()

// e2m1 values (doubled)
// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
GGML_TABLE_END()

#endif // GGML_COMMON_IMPL
#endif // GGML_COMMON_IMPL
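As a host-side illustration of how one of these blocks decodes (ad hoc names, not the repo's actual dequantization path, which is the CUDA kernel further down): the shared E8M0 byte encodes a power-of-two scale 2^(e - 127), and because kvalues_mxfp4 stores the E2M1 values doubled, the effective scale is halved, i.e. 2^(e - 128).

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Standalone sketch of decoding one block_mxfp4 to 32 floats. Field names follow the
// struct above; this is illustrative host code only.
static const int8_t kvalues_mxfp4[16] = { 0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12 };

struct block_mxfp4 {
    uint8_t e;        // shared E8M0 scale: 2^(e - 127)
    uint8_t qs[16];   // 32 x 4-bit E2M1 values, two per byte
};

static void dequantize_mxfp4(const block_mxfp4 & b, float out[32]) {
    // kvalues_mxfp4 holds the E2M1 values doubled, so halve the E8M0 scale (e - 1).
    // This is what the CUDA kernel's "(e - 1) << 23" bit trick computes; ldexp also
    // covers the subnormal cases e = 0, 1 that the kernel handles via its uval table.
    const float d = std::ldexp(1.0f, int(b.e) - 127 - 1);
    for (int j = 0; j < 16; ++j) {
        out[j]      = d * kvalues_mxfp4[b.qs[j] & 0xf];   // low nibbles -> first 16 values
        out[j + 16] = d * kvalues_mxfp4[b.qs[j] >> 4];    // high nibbles -> last 16 values
    }
}

int main() {
    block_mxfp4 b = { 127, {} };          // e = 127: scale 2^0, halved -> d = 0.5
    b.qs[0] = (9 << 4) | 7;               // low nibble 7 -> 12 (doubled 6.0), high nibble 9 -> -1 (doubled -0.5)
    float out[32];
    dequantize_mxfp4(b, out);
    std::printf("%g %g\n", out[0], out[16]);   // prints 6 and -0.5
}
```
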
3 changes: 2 additions & 1 deletion ggml/src/ggml-cuda.cu
@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
}

const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1) {
if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) {
//printf("invoking fast path for %s x %s\n", src0->name, src1->name);
int id = ctx.device;
char * src0_dd_i = dev[id].src0_dd;
@@ -3498,6 +3498,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ2_KL:
case GGML_TYPE_IQ3_KS:
7 changes: 7 additions & 0 deletions ggml/src/ggml-cuda/common.cuh
@@ -550,6 +550,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
static constexpr int qk = QK4_NL;
static constexpr int qr = QR4_NL;
static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
static constexpr int qk = QK_K;
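Reusing the IQ4_NL trait constants for MXFP4 appears deliberate: both formats use 32-value blocks with two 4-bit values per byte, so qk = 32, qr = 2 and qi = 4 coincide with QK_MXFP4, QR_MXFP4 and QI_MXFP4 = QK_MXFP4 / (4 * QR_MXFP4) as defined in ggml-common.h above.
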
32 changes: 32 additions & 0 deletions ggml/src/ggml-cuda/convert.cu
@@ -736,6 +736,27 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
}
}

template<typename dst_t>
static __global__ void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy) {

constexpr uint32_t uval[2] = { 0x00200000, 0x00400000 };
const int64_t i = blockIdx.x;
const block_mxfp4 * x = (const block_mxfp4 *) vx + i*(QK_K/QK4_NL);

const int64_t tid = threadIdx.x;
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[ib].qs + 4*il;
union { float f; uint32_t u; } helper;
helper.u = x[ib].e >= 2 ? uint32_t(x[ib].e - 1) << 23u : uval[x[ib].e];
const float d = helper.f;
for (int j = 0; j < 4; ++j) {
y[j+ 0] = d * kvalues_mxfp4[q4[j] & 0xf];
y[j+16] = d * kvalues_mxfp4[q4[j] >> 4];
}
}
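Here each CUDA block covers QK_K = 256 output values, i.e. eight 32-value MXFP4 blocks, and each of the 32 threads decodes 4 bytes (8 values) of block ib = tid % 8. The helper union builds the scale directly in the float exponent field: for e >= 2 it writes (e - 1) << 23, which is 2^(e - 128), while the uval table supplies the two subnormal cases e = 0 and e = 1; the extra halving relative to the E8M0 value 2^(e - 127) compensates for the doubled entries in kvalues_mxfp4.
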

template<typename dst_t>
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
const int64_t i = blockIdx.x;
@@ -1611,6 +1632,13 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
const int64_t k = nrows * n_per_row;
const int nb = (k + QK_K - 1) / QK_K;
dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y);
}

template<typename dst_t>
static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t nrows, const int64_t n_per_row, cudaStream_t stream) {
const int64_t k = nrows * n_per_row;
@@ -1943,6 +1971,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
return dequantize_row_iq2_bn_cuda;
case GGML_TYPE_IQ4_NL:
return dequantize_row_iq4_nl_cuda;
case GGML_TYPE_MXFP4:
return dequantize_row_mxfp4_cuda;
case GGML_TYPE_IQ4_XS:
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ4_KS:
@@ -2044,6 +2074,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
return dequantize_row_iq2_bn_cuda;
case GGML_TYPE_IQ4_NL:
return dequantize_row_iq4_nl_cuda;
case GGML_TYPE_MXFP4:
return dequantize_row_mxfp4_cuda;
case GGML_TYPE_IQ4_XS:
return dequantize_row_iq4_xs_cuda;
case GGML_TYPE_IQ4_KS:
6 changes: 5 additions & 1 deletion ggml/src/ggml-cuda/mmq.cu
@@ -14,7 +14,7 @@ void ggml_cuda_op_mul_mat_q(
const int64_t src1_padded_row_size, cudaStream_t stream) {

const int64_t ne00 = src0->ne[0];
const int64_t nb01 = src0->nb[1];
const int64_t nb01 = ggml_row_size(src0->type, ne00);

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
@@ -94,6 +94,9 @@ void ggml_cuda_op_mul_mat_q(
case GGML_TYPE_IQ4_NL:
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
break;
case GGML_TYPE_MXFP4:
mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
break;
case GGML_TYPE_IQ2_KL:
mul_mat_q_case<GGML_TYPE_IQ2_KL>(ctx, args, stream);
break;
@@ -210,6 +213,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
case GGML_TYPE_IQ1_S_R4:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4:
case GGML_TYPE_IQ2_KL:
case GGML_TYPE_IQ3_KS:
case GGML_TYPE_IQ4_KSS: