
Commit ddbba17

Revert "llama : add gpt-oss (ggml-org#15091)"
1 parent 61c19fe commit ddbba17

79 files changed: +157 −2671 lines

common/arg.cpp

Lines changed: 1 addition & 2 deletions

@@ -2949,12 +2949,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
             "- none: leaves thoughts unparsed in `message.content`\n"
             "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-            "(default: auto)",
+            "(default: deepseek)",
             [](common_params & params, const std::string & value) {
                 /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
                 else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
                 else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-                else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
                 else { throw std::invalid_argument("invalid value"); }
             }
         ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));

common/chat.cpp

Lines changed: 0 additions & 30 deletions

@@ -606,7 +606,6 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
-        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }

@@ -615,7 +614,6 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
-        case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
         default:

@@ -1305,26 +1303,6 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
-static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-    auto prompt = apply(tmpl, inputs);
-
-    data.prompt = prompt;
-    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
-
-    // TODO: support tool calls in GPT-OSS?
-
-    return data;
-}
-static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
-    // TODO @ngxson : this won't work with --special enabled, we should fix that
-    builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
-    if (!builder.syntax().parse_tool_calls) {
-        builder.add_content(builder.consume_rest());
-        return;
-    }
-}
-
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;

@@ -1810,11 +1788,6 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }
 
-    // GPT-OSS
-    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
-        return common_chat_params_init_gpt_oss(tmpl, params);
-    }
-
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {

@@ -1966,9 +1939,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
-        case COMMON_CHAT_FORMAT_GPT_OSS:
-            common_chat_parse_gpt_oss(builder);
-            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
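
For context, the reverted chat handler treated GPT-OSS output as two channels: reasoning in the `analysis` channel and the visible reply in the `final` channel, split on the two marker strings passed to `builder.try_parse_reasoning` above. Below is a minimal standalone sketch of that split on a raw completion string; the helper and its name are hypothetical, not part of llama.cpp:

// Hypothetical sketch: split a GPT-OSS-style completion into reasoning and
// final content, using the two markers the reverted parser searched for.
#include <cstdio>
#include <string>

struct parsed_msg {
    std::string reasoning_content;
    std::string content;
};

static parsed_msg parse_gpt_oss_sketch(const std::string & text) {
    static const std::string think_open  = "<|channel|>analysis<|message|>";
    static const std::string think_close = "<|start|>assistant<|channel|>final<|message|>";

    parsed_msg out;
    const size_t open_pos = text.find(think_open);
    if (open_pos == std::string::npos) {
        out.content = text; // no analysis channel: everything is visible content
        return out;
    }
    const size_t body_pos  = open_pos + think_open.size();
    const size_t close_pos = text.find(think_close, body_pos);
    if (close_pos == std::string::npos) {
        out.reasoning_content = text.substr(body_pos); // unterminated analysis channel
        return out;
    }
    out.reasoning_content = text.substr(body_pos, close_pos - body_pos);
    out.content           = text.substr(close_pos + think_close.size());
    return out;
}

int main() {
    const parsed_msg msg = parse_gpt_oss_sketch(
        "<|channel|>analysis<|message|>reason about the request here"
        "<|start|>assistant<|channel|>final<|message|>The answer is 42.");
    printf("reasoning: %s\n", msg.reasoning_content.c_str());
    printf("content:   %s\n", msg.content.c_str());
    return 0;
}

In the real parser the same extraction is driven by `common_chat_msg_parser`, with the two pieces ending up in `message.reasoning_content` and `message.content` respectively.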

common/chat.h

Lines changed: 0 additions & 1 deletion

@@ -109,7 +109,6 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_GPT_OSS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

common/common.h

Lines changed: 1 addition & 2 deletions

@@ -232,7 +232,6 @@ struct common_params_diffusion {
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };

@@ -391,7 +390,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 

convert_hf_to_gguf.py

Lines changed: 0 additions & 114 deletions

@@ -7950,119 +7950,6 @@ def set_vocab(self):
         self.gguf_writer.add_chat_template(chat_template)
 
 
-@ModelBase.register("GptOssForCausalLM")
-class GptOssModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.GPT_OSS
-
-    def transform_nibble_layout(self, tensor):
-        assert tensor.dtype == torch.uint8
-        assert tensor.shape[-1] == 16
-        # swap nibbles
-        t_lo = tensor & 0x0F
-        t_hi = tensor & 0xF0
-        t_swapped = (t_lo << 4) | (t_hi >> 4)
-        tensor = t_swapped
-        # transform aaaa...bbbb... to abababab...
-        blk_a, blk_b = tensor.chunk(2, dim=-1)
-        # get a_
-        blk_a0 = (blk_a & 0xF0).view(-1, 1)
-        blk_a1 = (blk_a << 4).view(-1, 1)
-        blk_a = torch.stack((blk_a0, blk_a1), dim=2).view(tensor.shape)
-        # get _b
-        blk_b0 = (blk_b >> 4).view(-1, 1)
-        blk_b1 = (blk_b & 0x0F).view(-1, 1)
-        blk_b = torch.stack((blk_b0, blk_b1), dim=2).view(tensor.shape)
-        # swap once more
-        out = blk_a | blk_b
-        out_h = out & 0xF0
-        out_l = out & 0x0F
-        out = (out_h >> 4) | (out_l << 4)
-        return out
-
-    def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
-        assert blocks.dtype == torch.uint8
-        assert scales.dtype == torch.uint8
-        scales = scales.unsqueeze(-1)
-        assert len(blocks.shape) == 4
-        assert len(scales.shape) == 4
-        blocks = self.transform_nibble_layout(blocks)
-        new_data = torch.concat((scales, blocks), dim=-1)
-        new_shape = [new_data.shape[0], new_data.shape[1], new_data.shape[2] * 32]
-        logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4")
-        # flatten last dim
-        new_data = new_data.view(new_data.shape[0], new_data.shape[1], new_data.shape[2] * new_data.shape[3])
-        new_data = new_data.numpy()
-        self.gguf_writer.add_tensor(new_name, new_data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        blocks0: Tensor = torch.zeros(1)
-        blocks1: Tensor = torch.zeros(1)
-        found_mxfp4_tensors = False
-        # we assume that tensors are loaded in the correct order
-        for name, data_torch in self.get_tensors():
-            if "mlp.experts.down_proj_blocks" in name:
-                blocks0 = data_torch
-            elif "mlp.experts.down_proj_scales" in name:
-                new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
-                self.repack_mxfp4(new_name, blocks0, data_torch)
-                found_mxfp4_tensors = True
-            elif "mlp.experts.gate_up_proj_blocks" in name:
-                blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
-            elif "mlp.experts.gate_up_proj_scales" in name:
-                scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :]
-                new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight"))
-                new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
-                self.repack_mxfp4(new_name_gate, blocks0, scales0)
-                self.repack_mxfp4(new_name_up, blocks1, scales1)
-                found_mxfp4_tensors = True
-        if not found_mxfp4_tensors:
-            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
-        return []
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        if "sinks" in name:
-            name += ".weight"
-
-        # correct naming for down_proj
-        if "down_proj" in name:
-            if name.endswith("_bias"):
-                name = name.replace("down_proj_bias", "down_proj.bias")
-            else:
-                return []
-
-        # split the gate_up into gate and up
-        if "gate_up_proj" in name:
-            if name.endswith("_bias"):
-                name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
-                name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
-                gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2]
-                return [
-                    (self.map_tensor_name(name_gate), gate_proj_bias),
-                    (self.map_tensor_name(name_up), up_proj_bias)
-                ]
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
-        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
-        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-        self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-        self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
-
-
 @ModelBase.register("Lfm2ForCausalLM")
 @ModelBase.register("LFM2ForCausalLM")
 class LFM2Model(TextModel):

@@ -8202,7 +8089,6 @@ class LazyTorchTensor(gguf.LazyBase):
     _dtype_map: dict[torch.dtype, type] = {
         torch.float16: np.float16,
         torch.float32: np.float32,
-        torch.uint8: np.uint8,
     }
 
     # used for safetensors slices
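
For reference, the heart of the reverted converter is the nibble/byte shuffle in `transform_nibble_layout`, which reorders the Hugging Face MXFP4 packing (16 bytes per 32-element block) into the layout used after repacking, before `repack_mxfp4` prepends the per-block scale byte. Below is a sketch of that same shuffle applied to a single 16-byte group in plain C++ rather than torch tensor ops; it mirrors the reverted steps and is for illustration only:

// Sketch only: the reverted transform_nibble_layout, re-expressed on one
// 16-byte MXFP4 block with plain C++ instead of torch tensor ops.
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<uint8_t, 16> transform_nibble_layout(const std::array<uint8_t, 16> & in) {
    // step 1: swap the two nibbles of every byte
    std::array<uint8_t, 16> swapped{};
    for (int i = 0; i < 16; ++i) {
        swapped[i] = uint8_t((in[i] << 4) | (in[i] >> 4));
    }
    // steps 2-4: interleave first and second half ("aaaa...bbbb..." -> "abab..."),
    // combine one nibble from each, then swap nibbles once more
    std::array<uint8_t, 16> out{};
    for (int i = 0; i < 8; ++i) {
        const uint8_t a = swapped[i];      // byte from the first half
        const uint8_t b = swapped[8 + i];  // byte from the second half
        const uint8_t even = uint8_t((a & 0xF0) | (b >> 4)); // blk_a0 | blk_b0
        const uint8_t odd  = uint8_t((a << 4) | (b & 0x0F)); // blk_a1 | blk_b1
        out[2 * i]     = uint8_t((even << 4) | (even >> 4)); // final nibble swap
        out[2 * i + 1] = uint8_t((odd << 4) | (odd >> 4));
    }
    return out;
}

int main() {
    std::array<uint8_t, 16> hf_block{};
    for (int i = 0; i < 16; ++i) {
        hf_block[i] = uint8_t(i | (i << 4)); // arbitrary test pattern
    }
    const std::array<uint8_t, 16> repacked = transform_nibble_layout(hf_block);
    for (const uint8_t b : repacked) {
        printf("%02x ", b);
    }
    printf("\n");
    return 0;
}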

ggml/include/ggml.h

Lines changed: 1 addition & 37 deletions

@@ -310,16 +310,6 @@
     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
-#define GGML_TENSOR_TERNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
-    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
-
 #define GGML_TENSOR_BINARY_OP_LOCALS01 \
     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
     GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \

@@ -411,8 +401,7 @@ extern "C" {
         GGML_TYPE_IQ4_NL_4_4 = 36, //deprecated upstream
         // GGML_TYPE_IQ4_NL_4_8 = 37,
        // GGML_TYPE_IQ4_NL_8_8 = 38,
-        GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
-        GGML_TYPE_COUNT = 40,
+        GGML_TYPE_COUNT = 39,
     };
 
     // precision

@@ -447,7 +436,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
-        GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
     };
 
     // available tensor operations:

@@ -456,7 +444,6 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
-        GGML_OP_ADD_ID,
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,

@@ -576,7 +563,6 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
-        GGML_GLU_OP_SWIGLU_OAI,
         GGML_GLU_OP_GEGLU_ERF,
         GGML_GLU_OP_GEGLU_QUICK,
 

@@ -858,13 +844,6 @@ extern "C" {
             struct ggml_tensor * b,
             enum ggml_type type);
 
-    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
-    GGML_API struct ggml_tensor * ggml_add_id(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            struct ggml_tensor * ids);
-
     GGML_API struct ggml_tensor * ggml_add1(
             struct ggml_context * ctx,
             struct ggml_tensor * a,

@@ -1232,13 +1211,6 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    GGML_API struct ggml_tensor * ggml_swiglu_oai(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            float alpha,
-            float limit);
-
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,

@@ -1611,10 +1583,6 @@ extern "C" {
             float scale,
             float max_bias);
 
-    GGML_API void ggml_soft_max_add_sinks(
-            struct ggml_tensor * a,
-            struct ggml_tensor * sinks);
-
     GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,

@@ -2097,10 +2065,6 @@ extern "C" {
     GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
             const struct ggml_tensor * a);
 
-    GGML_API void ggml_flash_attn_ext_add_sinks(
-            struct ggml_tensor * a,
-            struct ggml_tensor * sinks);
-
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
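
Of the removed declarations, `ggml_add_id` is the least self-describing; its deleted comment gives the indexing rule dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]], i.e. each row of `a` gets a bias row of `b` selected by an id tensor. A reference loop with that semantics, sketched on flat arrays rather than `ggml_tensor`s (the helper name and layout here are illustrative assumptions, not ggml API):

// Reference semantics of the removed ggml_add_id, on flat float arrays.
// dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
// a and dst have shape [n0, n1, n2] (i0 fastest); ids[i1, i2] selects which
// n0-wide row of b is added to row (i1, i2) of a.
#include <cstdint>
#include <cstdio>
#include <vector>

static void add_id_ref(float * dst, const float * a, const float * b,
                       const int32_t * ids, int64_t n0, int64_t n1, int64_t n2) {
    for (int64_t i2 = 0; i2 < n2; ++i2) {
        for (int64_t i1 = 0; i1 < n1; ++i1) {
            const int32_t id = ids[i2 * n1 + i1]; // row of b to add
            for (int64_t i0 = 0; i0 < n0; ++i0) {
                dst[(i2 * n1 + i1) * n0 + i0] = a[(i2 * n1 + i1) * n0 + i0] + b[id * n0 + i0];
            }
        }
    }
}

int main() {
    // 2 rows of width 4, each picking a different bias row from b
    const std::vector<float>   a   = {1, 1, 1, 1,  2, 2, 2, 2};
    const std::vector<float>   b   = {10, 10, 10, 10,  20, 20, 20, 20};
    const std::vector<int32_t> ids = {1, 0};
    std::vector<float> dst(a.size());
    add_id_ref(dst.data(), a.data(), b.data(), ids.data(), 4, 2, 1);
    for (const float v : dst) {
        printf("%g ", v); // 21 21 21 21 12 12 12 12
    }
    printf("\n");
    return 0;
}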

ggml/src/ggml-alloc.c

Lines changed: 0 additions & 1 deletion

@@ -29,7 +29,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_ADD:
-        case GGML_OP_ADD_ID:
         case GGML_OP_ADD1:
         case GGML_OP_SUB:
         case GGML_OP_MUL:

ggml/src/ggml-common.h

Lines changed: 0 additions & 17 deletions

@@ -99,9 +99,6 @@ typedef sycl::half2 ggml_half2;
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 #define QR4_1 2
 
-#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
-#define QR_MXFP4 2
-
 #define QI5_0 (QK5_0 / (4 * QR5_0))
 #define QR5_0 2
 

@@ -187,13 +184,6 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
 
-#define QK_MXFP4 32
-typedef struct {
-    uint8_t e; // E8M0
-    uint8_t qs[QK_MXFP4/2];
-} block_mxfp4;
-static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
-
 #define QK5_0 32
 typedef struct {
     ggml_half d; // delta

@@ -1084,17 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()
 
-// TODO: fix name to kvalues_iq4_nl
 GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
     -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
 GGML_TABLE_END()
 
-// e2m1 values (doubled)
-// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
-GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
-    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
-GGML_TABLE_END()
-
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
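
Putting the removed pieces together: a `block_mxfp4` held one E8M0 scale byte plus 32 packed 4-bit E2M1 codes, and `kvalues_mxfp4` stored the 16 E2M1 code values doubled. Below is a dequantization sketch under two assumptions not stated in the diff: the low nibble of `qs[j]` holds element `j` and the high nibble element `j + 16` (the usual ggml 4-bit layout), and the doubled table entries are compensated by halving the 2^(e-127) scale:

// Sketch: dequantizing one removed block_mxfp4 (32 values).
// Assumptions not stated in the diff: qs[j] holds element j in its low nibble
// and element j + 16 in its high nibble, and the E8M0 scale is 2^(e - 127),
// halved here to undo the doubled table entries.
#include <cmath>
#include <cstdint>
#include <cstdio>

#define QK_MXFP4 32

typedef struct {
    uint8_t e;                 // E8M0 shared scale (biased exponent)
    uint8_t qs[QK_MXFP4 / 2];  // 32 packed 4-bit E2M1 codes
} block_mxfp4;

// e2m1 values (doubled), as in the removed kvalues_mxfp4 table
static const int8_t kvalues_mxfp4[16] = {
    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
};

static void dequantize_block_mxfp4(const block_mxfp4 * x, float * y) {
    const float d = ldexpf(1.0f, (int) x->e - 127) * 0.5f; // halved E8M0 scale
    for (int j = 0; j < QK_MXFP4 / 2; ++j) {
        y[j]                = d * kvalues_mxfp4[x->qs[j] & 0x0F];
        y[j + QK_MXFP4 / 2] = d * kvalues_mxfp4[x->qs[j] >> 4];
    }
}

int main(void) {
    block_mxfp4 blk = {0};
    blk.e     = 127;              // scale factor 2^0
    blk.qs[0] = (9 << 4) | 7;     // element 0 -> code 7, element 16 -> code 9
    float y[QK_MXFP4];
    dequantize_block_mxfp4(&blk, y);
    printf("y[0] = %g, y[16] = %g\n", y[0], y[16]); // expected: 6 and -0.5
    return 0;
}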
