Skip to content
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
a7df116
qwen3next: add architecture support and recurrent-state fixes
Feb 6, 2026
9fbb504
qwen3next: optimize broadcast sub and single-seq ssm conv
Feb 6, 2026
89e9ecf
cuda: build MoE row mapping on device in mul_mat_id
Feb 6, 2026
236633a
cuda: add guarded multi-seq fast path for ssm_conv
Feb 6, 2026
c767cfa
docs: update qwen3next perf report for cuda MoE/SSM tuning
Feb 6, 2026
e64b433
cuda: reduce qwen3next moe/ssm sync overhead and refresh eval
Feb 6, 2026
6db8dc8
qwen3next: split cpu/cuda eval builds and tune PP scheduling
Feb 7, 2026
fffd27e
qwen3next: harden seq-state flow and support optional dense FFN layers
Feb 7, 2026
a1163d0
qwen3next: trim delta-net graph overhead in chunking path
Feb 7, 2026
0e3891b
qwen3next: remove redundant v_conv cont in delta path
Feb 7, 2026
43edfa2
qwen3next: avoid extra cont on linear attention output
Feb 7, 2026
de5bf44
qwen3next: drop redundant cont before recurrent state flatten
Feb 7, 2026
5a6c4e8
qwen3next: keep recurrent state in 4d layout through delta path
Feb 7, 2026
6dd990d
qwen3next: add fused delta-net op and wire model path
Feb 7, 2026
ed0565f
tests: add backend-op coverage for ggml_delta_net
Feb 7, 2026
b33cef6
qwen3next: add runtime switch for fused delta-net path
Feb 8, 2026
81e788e
docs: refresh qwen3next perf review and benchmark matrix
Feb 8, 2026
9930f4d
qwen3next: default fused delta-net off and document quality checks
Feb 8, 2026
143e88a
qwen3next: add decode-only fused delta mode
Feb 8, 2026
64099e7
qwen3next: make fused delta safe by default and fix fused tensor layout
Feb 8, 2026
343e335
qwen3next: warn when forcing fused decode mode
Feb 8, 2026
44db394
qwen3next: add fused-delta regression runner script
Feb 8, 2026
55270b0
qwen3next: integrate fused regression into eval harness
Feb 8, 2026
670434e
qwen3next: clean up chunked delta-net shape handling
Feb 8, 2026
691df60
qwen3next: add absolute sanity guards to fused regression
Feb 8, 2026
a822db6
qwen3next: add unified regression runner script
Feb 8, 2026
627d469
qwen3next: disable flash-attn for cpu-only contexts
Feb 8, 2026
bd0dd78
docs: reconcile qwen3next status and remaining upstream gaps
Feb 8, 2026
b5c9554
common: add qwen3next fused-delta runtime flag
Feb 8, 2026
eef360a
cuda: add qwen3next delta-net kernel dispatch override
Feb 8, 2026
69529d3
docs: update qwen3next quality and serving baseline findings
Feb 8, 2026
48e0e35
qwen3next: keep fused delta on safe path and remove PR artifacts
Feb 9, 2026
9241164
qwen3next: align autoregressive delta-net decode layout
Feb 9, 2026
6009557
Revert "qwen3next: align autoregressive delta-net decode layout"
Feb 9, 2026
113ad6c
cuda: port solve-tri fast-paths for qwen3next delta-net
Feb 9, 2026
6f21f24
qwen3next: add fused-delta runtime flag and drop env toggle
Feb 9, 2026
f1f6da7
qwen3next: make fused delta single-flag and default on
Feb 9, 2026
4ab02c9
Account for GPU arch differences
Feb 10, 2026
117ff5d
Revert "cuda: build MoE row mapping on device in mul_mat_id"
Feb 10, 2026
6d8fb70
qwen3next: drop non-essential MoE scheduling and split heuristics
Feb 10, 2026
ed10c94
qwen3next: avoid generic ggml_sub broadcast changes
Feb 10, 2026
4e55ac7
llama: restore only_active_experts log message
Feb 10, 2026
71035bf
Merge branch 'ikawrakow:main' into main
YurkoHoshko Feb 10, 2026
012377b
Remove unnecessary hacks, disable fusion for now.
Feb 11, 2026
b7781f2
qwen3next: port hybrid recurrent state memory semantics
Feb 11, 2026
d7b6358
qwen3next: clean up recurrent state slot plumbing
Feb 11, 2026
aaa1b12
qwen3next: fix hybrid V-cache layout plumbing
Feb 11, 2026
cac3c5f
qwen3next: guard recurrent state slots against kv capacity
Feb 11, 2026
c771416
qwen3next: persist recurrent state in session data
Feb 11, 2026
dd690cb
qwen3next: drop unused fused-delta builder path
Feb 11, 2026
3470e8a
qwen3next: remove unused fused-delta CLI/context plumbing
Feb 11, 2026
cb99ab7
ggml: remove unused DELTA_NET operator stack
Feb 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
482 changes: 482 additions & 0 deletions docs/development/qwen3next_perf_diff_report.md
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This document was used solely by the AI assistant during development; I included it only to keep track of progress. Please ignore it.

Large diffs are not rendered by default.

80 changes: 80 additions & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ extern "C" {
GGML_OP_LOG,
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_CUMSUM,
GGML_OP_MEAN,
GGML_OP_ARGMAX,
GGML_OP_REPEAT,
Expand All @@ -611,6 +612,7 @@ extern "C" {
GGML_OP_RMS_NORM,
GGML_OP_RMS_NORM_BACK,
GGML_OP_GROUP_NORM,
GGML_OP_L2_NORM,
GGML_OP_FUSED_RMS_NORM,
GGML_OP_FUSED_MUL_UNARY,
GGML_OP_MULTI_ADD,
Expand Down Expand Up @@ -653,6 +655,8 @@ extern "C" {
GGML_OP_PAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_TRI,
GGML_OP_FILL,
GGML_OP_ARGSORT,
GGML_OP_ARGSORT_THRESH,
GGML_OP_GROUPED_TOPK,
Expand All @@ -671,6 +675,7 @@ extern "C" {
GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS,
GGML_OP_SOLVE_TRI,
GGML_OP_UNARY,

GGML_OP_MAP_UNARY,
Expand Down Expand Up @@ -710,6 +715,8 @@ extern "C" {
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,
GGML_UNARY_OP_SOFTPLUS,
GGML_UNARY_OP_SWIGLU,
GGML_UNARY_OP_SWIGLU_OAI,
GGML_UNARY_OP_GELU,
Expand Down Expand Up @@ -739,6 +746,13 @@ extern "C" {
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};

enum ggml_tri_type {
GGML_TRI_TYPE_LOWER,
GGML_TRI_TYPE_UPPER,
GGML_TRI_TYPE_LOWER_DIAG,
GGML_TRI_TYPE_UPPER_DIAG,
};

// ggml object
struct ggml_object {
size_t offs;
Expand Down Expand Up @@ -1189,6 +1203,14 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_softplus(
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_softplus_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);

// return scalar
GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx,
Expand All @@ -1199,6 +1221,10 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_cumsum(
struct ggml_context * ctx,
struct ggml_tensor * a);

// mean along rows
GGML_API struct ggml_tensor * ggml_mean(
struct ggml_context * ctx,
Expand All @@ -1217,6 +1243,15 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);

// repeat a to specified shape
GGML_API struct ggml_tensor * ggml_repeat_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3);

// sums repetitions in a into shape of b
GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
Expand Down Expand Up @@ -1455,6 +1490,14 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_exp(
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_exp_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);

// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
Expand Down Expand Up @@ -1514,6 +1557,17 @@ extern "C" {
int n_groups,
float eps);

// l2 normalize along rows
GGML_API struct ggml_tensor * ggml_l2_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps);

GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float eps);

// a - x
// b - dy
GGML_API struct ggml_tensor * ggml_rms_norm_back(
Expand Down Expand Up @@ -2283,6 +2337,23 @@ extern "C" {
int dim,
int max_period);

// convert matrix to triangular form by zeroing values outside selected half
GGML_API struct ggml_tensor * ggml_tri(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_tri_type type);

// fill tensor with constant c
GGML_API struct ggml_tensor * ggml_fill(
struct ggml_context * ctx,
struct ggml_tensor * a,
float c);

GGML_API struct ggml_tensor * ggml_fill_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
float c);

// sort rows
enum ggml_sort_order {
GGML_SORT_ORDER_ASC,
Expand Down Expand Up @@ -2426,6 +2497,15 @@ extern "C" {
struct ggml_tensor * pw,
struct ggml_tensor * ph);

// Solve Ax = B where A is triangular
GGML_API struct ggml_tensor * ggml_solve_tri(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool left,
bool lower,
bool uni);

// custom operators

typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
Expand Down
5 changes: 3 additions & 2 deletions ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1103,7 +1103,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif

#ifndef GGML_SCHED_MAX_SPLITS
#define GGML_SCHED_MAX_SPLITS 2048
#define GGML_SCHED_MAX_SPLITS 4096
#endif

#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
Expand Down Expand Up @@ -1731,7 +1731,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// by starting a new split, the memory of the previously offloaded weights can be reused
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = tensor_backend_id(src);
if (src_backend_id != cur_backend_id) {
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && !supported) {
need_new_split = true;
break;
}
Expand Down
Loading