Draft
69 commits
245f391
graph : reuse hybrid graphs
ggerganov Oct 9, 2025
638e2c2
graph : reuse recurrent graphs
ggerganov Oct 9, 2025
0b9c1ae
metal : fix mul-mm condition + fix mul-mv permuted kernels
ggerganov Oct 9, 2025
1f02d93
graph : fix reuse check for recurrent inputs
ggerganov Oct 10, 2025
00f115f
memory : move the recurrent state into the memory context
ggerganov Oct 10, 2025
2744d61
Revert "memory : move the recurrent state into the memory context"
ggerganov Oct 10, 2025
ab3f3fe
Merge branch 'gg/metal-mul-mat-fixes' into gg/graph-mamba-reuse
gabe-l-hart Oct 10, 2025
8c23c43
Added: tri, cumsum. Still a mess.
gabe-l-hart Oct 10, 2025
2a2e79c
feat(tests): Add --verbose | -v flag to test-backend-ops to print ten…
gabe-l-hart Oct 10, 2025
092f740
test: Add cumsum tests to test-backend-ops
gabe-l-hart Oct 10, 2025
6949ce7
feat(ggml-cpu): Add cumsum support for f16 and bf16
gabe-l-hart Oct 10, 2025
f8fba60
feat(ggml-cpu): Add F16 and BF16 support for tri
gabe-l-hart Oct 13, 2025
058160a
test: Add test cases for tri
gabe-l-hart Oct 13, 2025
86ce3da
chore: TODOs to loosen assertions in tri for ggml_is_contiguous
gabe-l-hart Oct 13, 2025
3a8958f
feat(ggml-metal): Initial (slow) implementation of cumsum for metal
gabe-l-hart Oct 13, 2025
cbaed86
feat(ggml-metal): Add stubs for metal tri
gabe-l-hart Oct 13, 2025
e596469
test: Use looser nmse for lower-precision types for cumsum
gabe-l-hart Oct 13, 2025
3011a6e
Merge remote-tracking branch 'origin/master' into Mamba2SSD
gabe-l-hart Oct 13, 2025
112d339
test: Allow multiple verbose flags to fully print tensors
gabe-l-hart Oct 15, 2025
78e137f
feat(llama-gguf): Print out the tensor type in llama-gguf r
gabe-l-hart Sep 26, 2025
e5587cb
feat(ggml-metal): Efficient implementation of cumsum for metal
gabe-l-hart Oct 15, 2025
0468b99
test: More verbose printing and better cumsum tests
gabe-l-hart Oct 15, 2025
c71e35e
fix(ggml-metal): better granularity for support bool for CUMSUM and TRI
gabe-l-hart Oct 15, 2025
5f0d2a1
feat(ggml-metal): Metal impl of tri
gabe-l-hart Oct 15, 2025
426580d
Merge remote-tracking branch 'origin/master' into Mamba2SSD
gabe-l-hart Oct 15, 2025
ba3b8db
fix(ggml-cpu): Fix warnings from build with gcc
gabe-l-hart Oct 15, 2025
dfae909
feat(ggml-cuda): common implementation of prefix sum
gabe-l-hart Oct 16, 2025
d1f8658
feat(ggml-cuda): CUDA implementation of CUMSUM
gabe-l-hart Oct 16, 2025
5071fbd
feat(ggml-cuda): CUDA implementation of TRI
gabe-l-hart Oct 16, 2025
be23a29
test: Add test-backend-ops perf tests for ssm conv and scan
gabe-l-hart Sep 25, 2025
71e2289
feat(ggml-cpu): Rename ggml_softplus to ggml_op_softplus to make room…
gabe-l-hart Oct 17, 2025
f6d60e3
feat(ggml-cpu): Add ggml_softplus tensor op for CPU
gabe-l-hart Oct 17, 2025
778e835
test: Better verbosity output for inputs in test-backend-ops
gabe-l-hart Oct 17, 2025
4228002
feat(ggml-metal): Add ggml_softplus support for metal
gabe-l-hart Oct 17, 2025
97bd17d
feat(ggml-cuda): Add support for ggml_softplus
gabe-l-hart Oct 17, 2025
ffd88ff
style: comments on ggml tri types
gabe-l-hart Oct 20, 2025
7409d9e
WIP(llama-model): Partial work on graph-based SSD implementation
gabe-l-hart Oct 20, 2025
ba74006
TEMP: Increase the max graph nodes to handle all the nodes for SSD
gabe-l-hart Oct 21, 2025
29b30c6
WIP: Shape-correct impl of SSD w/out multi-chunk support
gabe-l-hart Oct 21, 2025
fb68967
fix: Add names to tensors for better debugging and fix several wiring…
gabe-l-hart Oct 23, 2025
cd73f4d
fix(wip): Fix matmul order for CB and y
gabe-l-hart Oct 23, 2025
52be1ab
fix: Working output!!
gabe-l-hart Oct 23, 2025
f57dafe
feat(eval-callback): Use -vb to set tensor print width and number of …
gabe-l-hart Oct 24, 2025
8a87063
feat(ggml-cpu): Add ggml_tri_dims to support non-standard dims (with …
gabe-l-hart Oct 24, 2025
79bce3e
feat(ggml-metal): Extend metal tri imple for arbitrary dims and non-c…
gabe-l-hart Oct 24, 2025
1ceb15e
feat(ggml-cuda): Extend CUDA impl of tri to support arbitrary dims an…
gabe-l-hart Oct 24, 2025
ef12069
fix: Fix INT_MAX to use numeric_limits for better compiler compat
gabe-l-hart Oct 24, 2025
3da5c97
fix(temp): Fix CBdecay to make decay contiguous for metal
gabe-l-hart Oct 24, 2025
3336f3c
fix: Use ggml_tri_dims to avoid perm/cont for initial decay step
gabe-l-hart Oct 24, 2025
d1e15c0
feat(ggml-cpu): Add dim arg to ggml_cumsum
gabe-l-hart Oct 24, 2025
ee13af1
feat(ggml-metal): Support arbitrary dim and non-cont in cumsum
gabe-l-hart Oct 24, 2025
3b4055e
feat(ggml-cuda): Support arbitrary dims and non-cont in cumsum
gabe-l-hart Oct 24, 2025
3963a72
feat(wip): Partially working implementation with update from previous…
gabe-l-hart Oct 28, 2025
188ae84
refact: Avoid permute and cont for first cumsum
gabe-l-hart Oct 28, 2025
0441ccb
fix: Subset input states to match ids
gabe-l-hart Oct 29, 2025
aba30d6
fix: Fix the chunk size computation
gabe-l-hart Oct 29, 2025
62ac897
fix: Fix handling of batch size > 1 in chunk updates
gabe-l-hart Oct 29, 2025
36244fe
fix: Fix permutation for nemotron-h shape
gabe-l-hart Oct 29, 2025
5ff37fa
Merge remote-tracking branch 'origin/master' into Mamba2SSD
gabe-l-hart Nov 3, 2025
8b6f38a
feat(off-topic): print the number of elements in tensors with llama-gguf
gabe-l-hart Nov 4, 2025
82bba1d
feat(ggml-cpu): Add f16 and bf16 support for ssm_conv
gabe-l-hart Nov 4, 2025
7ad0f37
feat(llama-quant): Allow F16 and BF16 quants of ssm_conv1d.weight
gabe-l-hart Nov 4, 2025
6256f9a
feat(ggml-cpu): Add partial implementation of scale for f16
gabe-l-hart Nov 4, 2025
204cd80
feat(wip): Use type_k/type_v for hybrid cache types
gabe-l-hart Nov 4, 2025
86788a2
temp: Cast ssm to F32
gabe-l-hart Nov 4, 2025
de43d0b
feat(ggml-metal): Add support for F16 and BF16 ssm_conv weights
gabe-l-hart Nov 4, 2025
426a97c
feat: Keep ssm in f16 until output on SSD code path
gabe-l-hart Nov 5, 2025
6733bda
feat: Remove sub-ubatch batching
gabe-l-hart Nov 5, 2025
4435600
Merge remote-tracking branch 'origin/master' into Mamba2SSD
gabe-l-hart Nov 5, 2025
21 changes: 19 additions & 2 deletions examples/eval-callback/eval-callback.cpp
@@ -6,8 +6,17 @@

#include <cstdio>
#include <string>
#include <sstream>
#include <vector>
#include <numeric>
#include <limits>

// verbosity flag set via the params.verbosity CLI flag. This is used for two
// things:
// 1. If > 0, tensors are printed with 8 digits of precision instead of 4
// 2. If > 1, all tensor values are printed instead of the pretty-printed
// partial output
static int verbosity = 0;

/**
* This the arbitrary data which will be passed to each callback.
@@ -61,6 +70,10 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t *
}

static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
std::stringstream ss;
const int float_digits = verbosity > 0 ? 8 : 4;
ss << "%12." << float_digits << "f";
const auto float_fmt = ss.str();
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -93,7 +106,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
i0 = ne[0] - n;
}
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
LOG("%12.4f", v);
LOG(float_fmt.c_str(), v);
if (i0 < ne[0] - 1) LOG(", ");
}
LOG("],\n");
@@ -153,8 +166,9 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
}

if (!ggml_is_quantized(t->type)) {
const int print_width = verbosity > 1 ? std::numeric_limits<int>::max() : 3;
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
ggml_print_tensor(data, t->type, t->ne, t->nb, print_width);
}

return true;
@@ -192,6 +206,9 @@ int main(int argc, char ** argv) {

common_init();

// set verbosity for printing
verbosity = params.verbosity;

llama_backend_init();
llama_numa_init(params.numa);

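The verbosity handling added above is easy to check in isolation. Below is a hypothetical standalone C++ sketch (not part of the patch) of the same two effects: the float format widens from %12.4f to %12.8f once verbosity is non-zero, and the per-edge element budget jumps from 3 to effectively unlimited at verbosity > 1.

#include <cstdio>
#include <limits>
#include <sstream>
#include <string>

// mirror of the patch's logic: build the float format string and pick a print width
static void demo(int verbosity) {
    std::stringstream ss;
    ss << "%12." << (verbosity > 0 ? 8 : 4) << "f";
    const std::string float_fmt = ss.str();

    // number of elements printed at each edge of a row (INT_MAX ~ print everything)
    const int print_width = verbosity > 1 ? std::numeric_limits<int>::max() : 3;

    std::printf("verbosity=%d  fmt=%s  print_width=%d  sample=", verbosity, float_fmt.c_str(), print_width);
    std::printf(float_fmt.c_str(), 0.123456789);
    std::printf("\n");
}

int main() {
    demo(0);
    demo(1);
    demo(2);
    return 0;
}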
47 changes: 47 additions & 0 deletions ggml/include/ggml.h
@@ -475,6 +475,7 @@ extern "C" {
GGML_OP_COS,
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_CUMSUM,
GGML_OP_MEAN,
GGML_OP_ARGMAX,
GGML_OP_COUNT_EQUAL,
@@ -530,6 +531,7 @@ extern "C" {
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
GGML_OP_LEAKY_RELU,
GGML_OP_TRI,

GGML_OP_FLASH_ATTN_EXT,
GGML_OP_FLASH_ATTN_BACK,
@@ -582,6 +584,7 @@ extern "C" {
GGML_UNARY_OP_CEIL,
GGML_UNARY_OP_ROUND,
GGML_UNARY_OP_TRUNC,
GGML_UNARY_OP_SOFTPLUS,

GGML_UNARY_OP_COUNT,
};
@@ -620,6 +623,13 @@ extern "C" {
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};

enum ggml_tri_type {
GGML_TRI_TYPE_UPPER_DIAG = 0, // upper including diag
GGML_TRI_TYPE_UPPER = 1, // upper excluding diag
GGML_TRI_TYPE_LOWER_DIAG = 2, // lower including diag
GGML_TRI_TYPE_LOWER = 3 // lower excluding diag
};

struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
@@ -983,6 +993,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

// Cumulative sum along the specified dimension
GGML_API struct ggml_tensor * ggml_cumsum(
struct ggml_context * ctx,
struct ggml_tensor * a,
int dim);

// Convenience function: cumulative sum along dimension 0
GGML_API struct ggml_tensor * ggml_cumsum_0(
struct ggml_context * ctx,
struct ggml_tensor * a);

// mean along rows
GGML_API struct ggml_tensor * ggml_mean(
struct ggml_context * ctx,
@@ -1194,6 +1215,11 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

// softplus(x) = log(1 + exp(beta * x)) / beta, for x * beta <= threshold
// = x, otherwise
GGML_API struct ggml_tensor * ggml_softplus(
struct ggml_context * ctx,
struct ggml_tensor * a);


// xIELU activation function
@@ -2187,6 +2213,27 @@ extern "C" {
int shift2,
int shift3);

// Make matrix into a triangular one (upper, upper + diagonal, lower or lower + diagonal) with constant value
// dim_x and dim_y specify which two dimensions to compare for triangular masking. They must have equal size.
// Default is dim_x=0, dim_y=1 (compares indices in dim 0 vs indices in dim 1)
GGML_API struct ggml_tensor * ggml_tri_dims(
struct ggml_context * ctx,
struct ggml_tensor * a,
float constant,
enum ggml_tri_type tritype,
int dim_x,
int dim_y);

GGML_API struct ggml_tensor * ggml_tri(
struct ggml_context * ctx,
struct ggml_tensor * a,
float constant,
enum ggml_tri_type tritype);

GGML_API struct ggml_tensor * ggml_tri_keep(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_tri_type tritype);

// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
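Taken together, the new entry points compose into small graphs directly from the public API. The following is a minimal usage sketch, not taken from the PR: it assumes a CPU-only context with tensor data allocated in-place (no_alloc = false), that ggml_graph_compute_with_ctx() is available via ggml-cpu.h, and that ggml_tri_keep() keeps the values inside the selected triangle while zeroing the rest; the exact masking semantics should be confirmed against the CPU kernels.

#include "ggml.h"
#include "ggml-cpu.h"   // assumed location of ggml_graph_compute_with_ctx()
#include <cstdio>

int main() {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,   // allocate tensor data inside this context
    };
    struct ggml_context * ctx = ggml_init(ip);

    // 4x4 matrix of ones (dim 0 is the contiguous dimension in ggml)
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    float * ad = (float *) a->data;
    for (int i = 0; i < 4*4; ++i) {
        ad[i] = 1.0f;
    }

    // cumulative sum along dim 0: every row becomes 1, 2, 3, 4
    struct ggml_tensor * cs = ggml_cumsum(ctx, a, /*dim =*/ 0);
    // keep the lower triangle (incl. diagonal); assumed to zero the rest
    struct ggml_tensor * lo = ggml_tri_keep(ctx, cs, GGML_TRI_TYPE_LOWER_DIAG);
    // elementwise softplus over the masked result
    struct ggml_tensor * sp = ggml_softplus(ctx, lo);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, sp);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    const float * out = (const float *) sp->data;
    for (int i1 = 0; i1 < 4; ++i1) {
        for (int i0 = 0; i0 < 4; ++i0) {
            std::printf("%8.4f ", out[i1*4 + i0]);
        }
        std::printf("\n");
    }

    ggml_free(ctx);
    return 0;
}

The SSD code path presumably combines the same building blocks (chunkwise cumulative sums, triangular decay masks, and softplus on the time-step parameter); the sketch above only illustrates the call shapes, not the actual model graph.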
11 changes: 11 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -1731,6 +1731,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_sum_rows(params, tensor);
} break;
case GGML_OP_CUMSUM:
{
ggml_compute_forward_cumsum(params, tensor);
} break;
case GGML_OP_MEAN:
{
ggml_compute_forward_mean(params, tensor);
@@ -1943,6 +1947,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_leaky_relu(params, tensor);
} break;
case GGML_OP_TRI:
{
ggml_compute_forward_tri(params, tensor);
} break;
case GGML_OP_FLASH_ATTN_EXT:
{
ggml_compute_forward_flash_attn_ext(params, tensor);
@@ -2153,6 +2161,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGMAX:
case GGML_OP_CUMSUM:
case GGML_OP_TRI:
{
n_tasks = 1;
} break;
@@ -2192,6 +2202,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_XIELU:
case GGML_UNARY_OP_SOFTPLUS:
{
n_tasks = n_threads;
} break;