
Commit ddbba17

Revert "llama : add gpt-oss (ggml-org#15091)"
1 parent 61c19fe commit ddbba17

79 files changed: +157 −2671 lines

common/arg.cpp

Lines changed: 1 addition & 2 deletions

@@ -2949,12 +2949,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
             "- none: leaves thoughts unparsed in `message.content`\n"
             "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-            "(default: auto)",
+            "(default: deepseek)",
             [](common_params & params, const std::string & value) {
                 /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
                 else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
                 else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-                else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
                 else { throw std::invalid_argument("invalid value"); }
             }
         ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));

common/chat.cpp

Lines changed: 0 additions & 30 deletions

@@ -606,7 +606,6 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
-        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }

@@ -615,7 +614,6 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
-        case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
         default:

@@ -1305,26 +1303,6 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
-static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-    auto prompt = apply(tmpl, inputs);
-
-    data.prompt = prompt;
-    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
-
-    // TODO: support tool calls in GPT-OSS?
-
-    return data;
-}
-static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
-    // TODO @ngxson : this won't work with --special enabled, we should fix that
-    builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
-    if (!builder.syntax().parse_tool_calls) {
-        builder.add_content(builder.consume_rest());
-        return;
-    }
-}
-
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;

@@ -1810,11 +1788,6 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }
 
-    // GPT-OSS
-    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
-        return common_chat_params_init_gpt_oss(tmpl, params);
-    }
-
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {

@@ -1966,9 +1939,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
-        case COMMON_CHAT_FORMAT_GPT_OSS:
-            common_chat_parse_gpt_oss(builder);
-            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
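
For context, the reverted chat handler treated GPT-OSS output as two channels: reasoning in the `analysis` channel and the visible reply in the `final` channel, split on the two marker strings passed to `builder.try_parse_reasoning` above. Below is a minimal standalone sketch of that split on a raw completion string; the helper and its name are hypothetical, not part of llama.cpp:

// Hypothetical sketch: split a GPT-OSS-style completion into reasoning and
// final content, using the two markers the reverted parser searched for.
#include <cstdio>
#include <string>

struct parsed_msg {
    std::string reasoning_content;
    std::string content;
};

static parsed_msg parse_gpt_oss_sketch(const std::string & text) {
    static const std::string think_open  = "<|channel|>analysis<|message|>";
    static const std::string think_close = "<|start|>assistant<|channel|>final<|message|>";

    parsed_msg out;
    const size_t open_pos = text.find(think_open);
    if (open_pos == std::string::npos) {
        out.content = text; // no analysis channel: everything is visible content
        return out;
    }
    const size_t body_pos  = open_pos + think_open.size();
    const size_t close_pos = text.find(think_close, body_pos);
    if (close_pos == std::string::npos) {
        out.reasoning_content = text.substr(body_pos); // unterminated analysis channel
        return out;
    }
    out.reasoning_content = text.substr(body_pos, close_pos - body_pos);
    out.content           = text.substr(close_pos + think_close.size());
    return out;
}

int main() {
    const parsed_msg msg = parse_gpt_oss_sketch(
        "<|channel|>analysis<|message|>reason about the request here"
        "<|start|>assistant<|channel|>final<|message|>The answer is 42.");
    printf("reasoning: %s\n", msg.reasoning_content.c_str());
    printf("content:   %s\n", msg.content.c_str());
    return 0;
}

In the real parser the same extraction is driven by `common_chat_msg_parser`, with the two pieces ending up in `message.reasoning_content` and `message.content` respectively.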

common/chat.h

Lines changed: 0 additions & 1 deletion

@@ -109,7 +109,6 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_GPT_OSS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

common/common.h

Lines changed: 1 addition & 2 deletions

@@ -232,7 +232,6 @@ struct common_params_diffusion {
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

@@ -391,7 +390,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 

convert_hf_to_gguf.py

Lines changed: 0 additions & 114 deletions

@@ -7950,119 +7950,6 @@ def set_vocab(self):
         self.gguf_writer.add_chat_template(chat_template)
 
 
-@ModelBase.register("GptOssForCausalLM")
-class GptOssModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.GPT_OSS
-
-    def transform_nibble_layout(self, tensor):
-        assert tensor.dtype == torch.uint8
-        assert tensor.shape[-1] == 16
-        # swap nibbles
-        t_lo = tensor & 0x0F
-        t_hi = tensor & 0xF0
-        t_swapped = (t_lo << 4) | (t_hi >> 4)
-        tensor = t_swapped
-        # transform aaaa...bbbb... to abababab...
-        blk_a, blk_b = tensor.chunk(2, dim=-1)
-        # get a_
-        blk_a0 = (blk_a & 0xF0).view(-1, 1)
-        blk_a1 = (blk_a << 4).view(-1, 1)
-        blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape)
-        # get _b
-        blk_b0 = (blk_b >> 4).view(-1, 1)
-        blk_b1 = (blk_b & 0x0F).view(-1, 1)
-        blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape)
-        # swap once more
-        out = blk_a | blk_b
-        out_h = out & 0xF0
-        out_l = out & 0x0F
-        out = (out_h >> 4) | (out_l << 4)
-        return out
-
-    def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
-        assert blocks.dtype == torch.uint8
-        assert scales.dtype == torch.uint8
-        scales = scales.unsqueeze(-1)
-        assert len(blocks.shape) == 4
-        assert len(scales.shape) == 4
-        blocks = self.transform_nibble_layout(blocks)
-        new_data = torch.concat((scales, blocks), dim=-1)
-        new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32]
-        logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4")
-        # flatten last dim
-        new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3])
-        new_data = new_data.numpy()
-        self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        blocks0: Tensor = torch.zeros(1)
-        blocks1: Tensor = torch.zeros(1)
-        found_mxfp4_tensors = False
-        # we assume that tensors are loaded in the correct order
-        for name, data_torch in self.get_tensors():
-            if "mlp.experts.down_proj_blocks" in name:
-                blocks0 = data_torch
-            elif "mlp.experts.down_proj_scales" in name:
-                new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
-                self.repack_mxfp4(new_name, blocks0, data_torch)
-                found_mxfp4_tensors = True
-            elif "mlp.experts.gate_up_proj_blocks" in name:
-                blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
-            elif "mlp.experts.gate_up_proj_scales" in name:
-                scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :]
-                new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight"))
-                new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
-                self.repack_mxfp4(new_name_gate, blocks0, scales0)
-                self.repack_mxfp4(new_name_up, blocks1, scales1)
-                found_mxfp4_tensors = True
-        if not found_mxfp4_tensors:
-            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
-        return []
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        if "sinks" in name:
-            name += ".weight"
-
-        # correct naming for down_proj
-        if "down_proj" in name:
-            if name.endswith("_bias"):
-                name = name.replace("down_proj_bias", "down_proj.bias")
-            else:
-                return []
-
-        # split the gate_up into gate and up
-        if "gate_up_proj" in name:
-            if name.endswith("_bias"):
-                name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
-                name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
-                gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2]
-                return [
-                    (self.map_tensor_name(name_gate), gate_proj_bias),
-                    (self.map_tensor_name(name_up), up_proj_bias)
-                ]
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
-        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
-        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-        self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-        self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
-
-
 @ModelBase.register("Lfm2ForCausalLM")
 @ModelBase.register("LFM2ForCausalLM")
 class LFM2Model(TextModel):

@@ -8202,7 +8089,6 @@ class LazyTorchTensor(gguf.LazyBase):
     _dtype_map: dict[torch.dtype, type] = {
         torch.float16: np.float16,
         torch.float32: np.float32,
-        torch.uint8: np.uint8,
     }
 
     # used for safetensors slices
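
For reference, the heart of the reverted converter is the nibble/byte shuffle in `transform_nibble_layout`, which reorders the Hugging Face MXFP4 packing (16 bytes per 32-element block) into the layout used after repacking, before `repack_mxfp4` prepends the per-block scale byte. Below is a sketch of that same shuffle applied to a single 16-byte group in plain C++ rather than torch tensor ops; it mirrors the reverted steps and is for illustration only:

// Sketch only: the reverted transform_nibble_layout, re-expressed on one
// 16-byte MXFP4 block with plain C++ instead of torch tensor ops.
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<uint8_t, 16> transform_nibble_layout(const std::array<uint8_t, 16> & in) {
    // step 1: swap the two nibbles of every byte
    std::array<uint8_t, 16> swapped{};
    for (int i = 0; i < 16; ++i) {
        swapped[i] = uint8_t((in[i] << 4) | (in[i] >> 4));
    }
    // steps 2-4: interleave first and second half ("aaaa...bbbb..." -> "abab..."),
    // combine one nibble from each, then swap nibbles once more
    std::array<uint8_t, 16> out{};
    for (int i = 0; i < 8; ++i) {
        const uint8_t a = swapped[i];      // byte from the first half
        const uint8_t b = swapped[8 + i];  // byte from the second half
        const uint8_t even = uint8_t((a & 0xF0) | (b >> 4)); // blk_a0 | blk_b0
        const uint8_t odd  = uint8_t((a << 4) | (b & 0x0F)); // blk_a1 | blk_b1
        out[2 * i]     = uint8_t((even << 4) | (even >> 4)); // final nibble swap
        out[2 * i + 1] = uint8_t((odd << 4) | (odd >> 4));
    }
    return out;
}

int main() {
    std::array<uint8_t, 16> hf_block{};
    for (int i = 0; i < 16; ++i) {
        hf_block[i] = uint8_t(i | (i << 4)); // arbitrary test pattern
    }
    const std::array<uint8_t, 16> repacked = transform_nibble_layout(hf_block);
    for (const uint8_t b : repacked) {
        printf("%02x ", b);
    }
    printf("\n");
    return 0;
}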

ggml/include/ggml.h

Lines changed: 1 addition & 37 deletions

@@ -310,16 +310,6 @@
     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
-#define GGML_TENSOR_TERNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
 #define GGML_TENSOR_BINARY_OP_LOCALS01 \
     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
     GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \

@@ -411,8 +401,7 @@ extern "C" {
         GGML_TYPE_IQ4_NL_4_4 = 36, //deprecated upstream
         // GGML_TYPE_IQ4_NL_4_8 = 37,
        // GGML_TYPE_IQ4_NL_8_8 = 38,
-        GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
-        GGML_TYPE_COUNT = 40,
+        GGML_TYPE_COUNT = 39,
     };
 
     // precision

@@ -447,7 +436,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
-        GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
     };
 
     // available tensor operations:

@@ -456,7 +444,6 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
-        GGML_OP_ADD_ID,
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,

@@ -576,7 +563,6 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
-        GGML_GLU_OP_SWIGLU_OAI,
         GGML_GLU_OP_GEGLU_ERF,
         GGML_GLU_OP_GEGLU_QUICK,
 

@@ -858,13 +844,6 @@ extern "C" {
             struct ggml_tensor * b,
             enum ggml_type type);
 
-    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
-    GGML_API struct ggml_tensor * ggml_add_id(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            struct ggml_tensor * ids);
-
     GGML_API struct ggml_tensor * ggml_add1(
             struct ggml_context * ctx,
             struct ggml_tensor * a,

@@ -1232,13 +1211,6 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    GGML_API struct ggml_tensor * ggml_swiglu_oai(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            float alpha,
-            float limit);
-
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,

@@ -1611,10 +1583,6 @@ extern "C" {
             float scale,
             float max_bias);
 
-    GGML_API void ggml_soft_max_add_sinks(
-            struct ggml_tensor * a,
-            struct ggml_tensor * sinks);
-
     GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,

@@ -2097,10 +2065,6 @@ extern "C" {
     GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
             const struct ggml_tensor * a);
 
-    GGML_API void ggml_flash_attn_ext_add_sinks(
-            struct ggml_tensor * a,
-            struct ggml_tensor * sinks);
-
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
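
Of the removed declarations, `ggml_add_id` is the least self-describing; its deleted comment gives the indexing rule dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]], i.e. each row of `a` gets a bias row of `b` selected by an id tensor. A reference loop with that semantics, sketched on flat arrays rather than `ggml_tensor`s (the helper name and layout here are illustrative assumptions, not ggml API):

// Reference semantics of the removed ggml_add_id, on flat float arrays.
// dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
// a and dst have shape [n0, n1, n2] (i0 fastest); ids[i1, i2] selects which
// n0-wide row of b is added to row (i1, i2) of a.
#include <cstdint>
#include <cstdio>
#include <vector>

static void add_id_ref(float * dst, const float * a, const float * b,
                       const int32_t * ids, int64_t n0, int64_t n1, int64_t n2) {
    for (int64_t i2 = 0; i2 < n2; ++i2) {
        for (int64_t i1 = 0; i1 < n1; ++i1) {
            const int32_t id = ids[i2 * n1 + i1]; // row of b to add
            for (int64_t i0 = 0; i0 < n0; ++i0) {
                dst[(i2 * n1 + i1) * n0 + i0] = a[(i2 * n1 + i1) * n0 + i0] + b[id * n0 + i0];
            }
        }
    }
}

int main() {
    // 2 rows of width 4, each picking a different bias row from b
    const std::vector<float>   a   = {1, 1, 1, 1,  2, 2, 2, 2};
    const std::vector<float>   b   = {10, 10, 10, 10,  20, 20, 20, 20};
    const std::vector<int32_t> ids = {1, 0};
    std::vector<float> dst(a.size());
    add_id_ref(dst.data(), a.data(), b.data(), ids.data(), 4, 2, 1);
    for (const float v : dst) {
        printf("%g ", v); // 21 21 21 21 12 12 12 12
    }
    printf("\n");
    return 0;
}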

ggml/src/ggml-alloc.c

Lines changed: 0 additions & 1 deletion

@@ -29,7 +29,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
-        case GGML_OP_ADD_ID:
         case GGML_OP_ADD1:
         case GGML_OP_SUB:
         case GGML_OP_MUL:

ggml/src/ggml-common.h

Lines changed: 0 additions & 17 deletions

@@ -99,9 +99,6 @@ typedef sycl::half2 ggml_half2;
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 #define QR4_1 2
 
-#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
-#define QR_MXFP4 2
-
 #define QI5_0 (QK5_0 / (4 * QR5_0))
 #define QR5_0 2
 

@@ -187,13 +184,6 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
 
-#define QK_MXFP4 32
-typedef struct {
-    uint8_t e; // E8M0
-    uint8_t qs[QK_MXFP4/2];
-} block_mxfp4;
-static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
-
 #define QK5_0 32
 typedef struct {
     ggml_half d; // delta

@@ -1084,17 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()
 
-// TODO: fix name to kvalues_iq4_nl
 GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
     -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
 GGML_TABLE_END()
 
-// e2m1 values (doubled)
-// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
-GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
-    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
-GGML_TABLE_END()
-
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
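
Putting the removed pieces together: a `block_mxfp4` held one E8M0 scale byte plus 32 packed 4-bit E2M1 codes, and `kvalues_mxfp4` stored the 16 E2M1 code values doubled. Below is a dequantization sketch under two assumptions not stated in the diff: the low nibble of `qs[j]` holds element `j` and the high nibble element `j + 16` (the usual ggml 4-bit layout), and the doubled table entries are compensated by halving the 2^(e-127) scale:

// Sketch: dequantizing one removed block_mxfp4 (32 values).
// Assumptions not stated in the diff: qs[j] holds element j in its low nibble
// and element j + 16 in its high nibble, and the E8M0 scale is 2^(e - 127),
// halved here to undo the doubled table entries.
#include <cmath>
#include <cstdint>
#include <cstdio>

#define QK_MXFP4 32

typedef struct {
    uint8_t e;                 // E8M0 shared scale (biased exponent)
    uint8_t qs[QK_MXFP4 / 2];  // 32 packed 4-bit E2M1 codes
} block_mxfp4;

// e2m1 values (doubled), as in the removed kvalues_mxfp4 table
static const int8_t kvalues_mxfp4[16] = {
    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
};

static void dequantize_block_mxfp4(const block_mxfp4 * x, float * y) {
    const float d = ldexpf(1.0f, (int) x->e - 127) * 0.5f; // halved E8M0 scale
    for (int j = 0; j < QK_MXFP4 / 2; ++j) {
        y[j]                = d * kvalues_mxfp4[x->qs[j] & 0x0F];
        y[j + QK_MXFP4 / 2] = d * kvalues_mxfp4[x->qs[j] >> 4];
    }
}

int main(void) {
    block_mxfp4 blk = {0};
    blk.e     = 127;              // scale factor 2^0
    blk.qs[0] = (9 << 4) | 7;     // element 0 -> code 7, element 16 -> code 9
    float y[QK_MXFP4];
    dequantize_block_mxfp4(&blk, y);
    printf("y[0] = %g, y[16] = %g\n", y[0], y[16]); // expected: 6 and -0.5
    return 0;
}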
