Draft
69 commits
245f391
graph : reuse hybrid graphs
ggerganov Oct 9, 2025
638e2c2
graph : reuse recurrent graphs
ggerganov Oct 9, 2025
0b9c1ae
metal : fix mul-mm condition + fix mul-mv permuted kernels
ggerganov Oct 9, 2025
1f02d93
graph : fix reuse check for recurrent inputs
ggerganov Oct 10, 2025
00f115f
memory : move the recurrent state into the memory context
ggerganov Oct 10, 2025
2744d61
Revert "memory : move the recurrent state into the memory context"
ggerganov Oct 10, 2025
ab3f3fe
Merge branch 'gg/metal-mul-mat-fixes' into gg/graph-mamba-reuse
gabe-l-hart Oct 10, 2025
8c23c43
Added: tri, cumsum. Still a mess.
gabe-l-hart Oct 10, 2025
2a2e79c
feat(tests): Add --verbose | -v flag to test-backend-ops to print ten…
gabe-l-hart Oct 10, 2025
092f740
test: Add cumsum tests to test-backend-ops
gabe-l-hart Oct 10, 2025
6949ce7
feat(ggml-cpu): Add cumsum support for f16 and bf16
gabe-l-hart Oct 10, 2025
f8fba60
feat(ggml-cpu): Add F16 and BF16 support for tri
gabe-l-hart Oct 13, 2025
058160a
test: Add test cases for tri
gabe-l-hart Oct 13, 2025
86ce3da
chore: TODOs to loosen assertions in tri for ggml_is_contiguous
gabe-l-hart Oct 13, 2025
3a8958f
feat(ggml-metal): Initial (slow) implementation of cumsum for metal
gabe-l-hart Oct 13, 2025
cbaed86
feat(ggml-metal): Add stubs for metal tri
gabe-l-hart Oct 13, 2025
e596469
test: Use looser nmse for lower-precision types for cumsum
gabe-l-hart Oct 13, 2025
3011a6e
Merge remote-tracking branch 'origin/master' into Mamba2SSD
gabe-l-hart Oct 13, 2025
112d339
test: Allow multiple verbose flags to fully print tensors
gabe-l-hart Oct 15, 2025
78e137f
feat(llama-gguf): Print out the tensor type in llama-gguf r
gabe-l-hart Sep 26, 2025
e5587cb
feat(ggml-metal): Efficient implementation of cumsum for metal
gabe-l-hart Oct 15, 2025
0468b99
test: More verbose printing and better cumsum tests
gabe-l-hart Oct 15, 2025
c71e35e
fix(ggml-metal): better granularity for support bool for CUMSUM and TRI
gabe-l-hart Oct 15, 2025
5f0d2a1
feat(ggml-metal): Metal impl of tri
gabe-l-hart Oct 15, 2025
426580d
Merge remote-tracking branch 'origin/master' into Mamba2SSD
gabe-l-hart Oct 15, 2025
ba3b8db
fix(ggml-cpu): Fix warnings from build with gcc
gabe-l-hart Oct 15, 2025
dfae909
feat(ggml-cuda): common implementation of prefix sum
gabe-l-hart Oct 16, 2025
d1f8658
feat(ggml-cuda): CUDA implementation of CUMSUM
gabe-l-hart Oct 16, 2025
5071fbd
feat(ggml-cuda): CUDA implementation of TRI
gabe-l-hart Oct 16, 2025
be23a29
test: Add test-backend-ops perf tests for ssm conv and scan
gabe-l-hart Sep 25, 2025
71e2289
feat(ggml-cpu): Rename ggml_softplus to ggml_op_softplus to make room…
gabe-l-hart Oct 17, 2025
f6d60e3
feat(ggml-cpu): Add ggml_softplus tensor op for CPU
gabe-l-hart Oct 17, 2025
778e835
test: Better verbosity output for inputs in test-backend-ops
gabe-l-hart Oct 17, 2025
4228002
feat(ggml-metal): Add ggml_softplus support for metal
gabe-l-hart Oct 17, 2025
97bd17d
feat(ggml-cuda): Add support for ggml_softplus
gabe-l-hart Oct 17, 2025
ffd88ff
style: comments on ggml tri types
gabe-l-hart Oct 20, 2025
7409d9e
WIP(llama-model): Partial work on graph-based SSD implementation
gabe-l-hart Oct 20, 2025
ba74006
TEMP: Increase the max graph nodes to handle all the nodes for SSD
gabe-l-hart Oct 21, 2025
29b30c6
WIP: Shape-correct impl of SSD w/out multi-chunk support
gabe-l-hart Oct 21, 2025
fb68967
fix: Add names to tensors for better debugging and fix several wiring…
gabe-l-hart Oct 23, 2025
cd73f4d
fix(wip): Fix matmul order for CB and y
gabe-l-hart Oct 23, 2025
52be1ab
fix: Working output!!
gabe-l-hart Oct 23, 2025
f57dafe
feat(eval-callback): Use -vb to set tensor print width and number of …
gabe-l-hart Oct 24, 2025
8a87063
feat(ggml-cpu): Add ggml_tri_dims to support non-standard dims (with …
gabe-l-hart Oct 24, 2025
79bce3e
feat(ggml-metal): Extend metal tri imple for arbitrary dims and non-c…
gabe-l-hart Oct 24, 2025
1ceb15e
feat(ggml-cuda): Extend CUDA impl of tri to support arbitrary dims an…
gabe-l-hart Oct 24, 2025
ef12069
fix: Fix INT_MAX to use numeric_limits for better compiler compat
gabe-l-hart Oct 24, 2025
3da5c97
fix(temp): Fix CBdecay to make decay contiguous for metal
gabe-l-hart Oct 24, 2025
3336f3c
fix: Use ggml_tri_dims to avoid perm/cont for initial decay step
gabe-l-hart Oct 24, 2025
d1e15c0
feat(ggml-cpu): Add dim arg to ggml_cumsum
gabe-l-hart Oct 24, 2025
ee13af1
feat(ggml-metal): Support arbitrary dim and non-cont in cumsum
gabe-l-hart Oct 24, 2025
3b4055e
feat(ggml-cuda): Support arbitrary dims and non-cont in cumsum
gabe-l-hart Oct 24, 2025
3963a72
feat(wip): Partially working implementation with update from previous…
gabe-l-hart Oct 28, 2025
188ae84
refact: Avoid permute and cont for first cumsum
gabe-l-hart Oct 28, 2025
0441ccb
fix: Subset input states to match ids
gabe-l-hart Oct 29, 2025
aba30d6
fix: Fix the chunk size computation
gabe-l-hart Oct 29, 2025
62ac897
fix: Fix handling of batch size > 1 in chunk updates
gabe-l-hart Oct 29, 2025
36244fe
fix: Fix permutation for nemotron-h shape
gabe-l-hart Oct 29, 2025
5ff37fa
Merge remote-tracking branch 'origin/master' into Mamba2SSD
gabe-l-hart Nov 3, 2025
8b6f38a
feat(off-topic): print the number of elements in tensors with llama-gguf
gabe-l-hart Nov 4, 2025
82bba1d
feat(ggml-cpu): Add f16 and bf16 support for ssm_conv
gabe-l-hart Nov 4, 2025
7ad0f37
feat(llama-quant): Allow F16 and BF16 quants of ssm_conv1d.weight
gabe-l-hart Nov 4, 2025
6256f9a
feat(ggml-cpu): Add partial implementation of scale for f16
gabe-l-hart Nov 4, 2025
204cd80
feat(wip): Use type_k/type_v for hybrid cache types
gabe-l-hart Nov 4, 2025
86788a2
temp: Cast ssm to F32
gabe-l-hart Nov 4, 2025
de43d0b
feat(ggml-metal): Add support for F16 and BF16 ssm_conv weights
gabe-l-hart Nov 4, 2025
426a97c
feat: Keep ssm in f16 until output on SSD code path
gabe-l-hart Nov 5, 2025
6733bda
feat: Remove sub-ubatch batching
gabe-l-hart Nov 5, 2025
4435600
Merge remote-tracking branch 'origin/master' into Mamba2SSD
gabe-l-hart Nov 5, 2025
21 changes: 19 additions & 2 deletions examples/eval-callback/eval-callback.cpp
@@ -6,8 +6,17 @@

#include <cstdio>
#include <string>
#include <sstream>
#include <vector>
#include <numeric>
#include <limits>

// verbosity flag set via the params.verbosity CLI flag. This is used for two
// things:
// 1. If > 0, tensors are printed with 8 digits of precision instead of 4
// 2. If > 1, all tensor values are printed instead of the pretty-printed
// partial output
static int verbosity = 0;

/**
* This the arbitrary data which will be passed to each callback.
@@ -61,6 +70,10 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t *
}

static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
std::stringstream ss;
const int float_digits = verbosity > 0 ? 8 : 4;
ss << "%12." << float_digits << "f";
const auto float_fmt = ss.str();
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -93,7 +106,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
i0 = ne[0] - n;
}
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
LOG("%12.4f", v);
LOG(float_fmt.c_str(), v);
if (i0 < ne[0] - 1) LOG(", ");
}
LOG("],\n");
@@ -153,8 +166,9 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
}

if (!ggml_is_quantized(t->type)) {
const int print_width = verbosity > 1 ? std::numeric_limits<int>::max() : 3;
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
ggml_print_tensor(data, t->type, t->ne, t->nb, print_width);
}

return true;
@@ -192,6 +206,9 @@ int main(int argc, char ** argv) {

common_init();

// set verbosity for printing
verbosity = params.verbosity;

llama_backend_init();
llama_numa_init(params.numa);

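The verbosity handling added above is easy to check in isolation. Below is a hypothetical standalone C++ sketch (not part of the patch) of the same two effects: the float format widens from %12.4f to %12.8f once verbosity is non-zero, and the per-edge element budget jumps from 3 to effectively unlimited at verbosity > 1.

#include <cstdio>
#include <limits>
#include <sstream>
#include <string>

// mirror of the patch's logic: build the float format string and pick a print width
static void demo(int verbosity) {
    std::stringstream ss;
    ss << "%12." << (verbosity > 0 ? 8 : 4) << "f";
    const std::string float_fmt = ss.str();

    // number of elements printed at each edge of a row (INT_MAX ~ print everything)
    const int print_width = verbosity > 1 ? std::numeric_limits<int>::max() : 3;

    std::printf("verbosity=%d  fmt=%s  print_width=%d  sample=", verbosity, float_fmt.c_str(), print_width);
    std::printf(float_fmt.c_str(), 0.123456789);
    std::printf("\n");
}

int main() {
    demo(0);
    demo(1);
    demo(2);
    return 0;
}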
47 changes: 47 additions & 0 deletions ggml/include/ggml.h
@@ -475,6 +475,7 @@ extern "C" {
GGML_OP_COS,
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_CUMSUM,
GGML_OP_MEAN,
GGML_OP_ARGMAX,
GGML_OP_COUNT_EQUAL,
@@ -530,6 +531,7 @@ extern "C" {
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
GGML_OP_LEAKY_RELU,
GGML_OP_TRI,

GGML_OP_FLASH_ATTN_EXT,
GGML_OP_FLASH_ATTN_BACK,
@@ -582,6 +584,7 @@ extern "C" {
GGML_UNARY_OP_CEIL,
GGML_UNARY_OP_ROUND,
GGML_UNARY_OP_TRUNC,
GGML_UNARY_OP_SOFTPLUS,

GGML_UNARY_OP_COUNT,
};
@@ -620,6 +623,13 @@ extern "C" {
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};

enum ggml_tri_type {
GGML_TRI_TYPE_UPPER_DIAG = 0, // upper including diag
GGML_TRI_TYPE_UPPER = 1, // upper excluding diag
GGML_TRI_TYPE_LOWER_DIAG = 2, // lower including diag
GGML_TRI_TYPE_LOWER = 3 // lower excluding diag
};

struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
@@ -983,6 +993,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

// Cumulative sum along the specified dimension
GGML_API struct ggml_tensor * ggml_cumsum(
struct ggml_context * ctx,
struct ggml_tensor * a,
int dim);

// Convenience function: cumulative sum along dimension 0
GGML_API struct ggml_tensor * ggml_cumsum_0(
struct ggml_context * ctx,
struct ggml_tensor * a);

// mean along rows
GGML_API struct ggml_tensor * ggml_mean(
struct ggml_context * ctx,
@@ -1194,6 +1215,11 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

// softplus(x) = log(1 + exp(beta * x)) / beta, for x * beta <= threshold
// = x, otherwise
GGML_API struct ggml_tensor * ggml_softplus(
struct ggml_context * ctx,
struct ggml_tensor * a);


// xIELU activation function
@@ -2187,6 +2213,27 @@ extern "C" {
int shift2,
int shift3);

// Make matrix into a triangular one (upper, upper + diagonal, lower or lower + diagonal) with constant value
// dim_x and dim_y specify which two dimensions to compare for triangular masking. They must have equal size.
// Default is dim_x=0, dim_y=1 (compares indices in dim 0 vs indices in dim 1)
GGML_API struct ggml_tensor * ggml_tri_dims(
struct ggml_context * ctx,
struct ggml_tensor * a,
float constant,
enum ggml_tri_type tritype,
int dim_x,
int dim_y);

GGML_API struct ggml_tensor * ggml_tri(
struct ggml_context * ctx,
struct ggml_tensor * a,
float constant,
enum ggml_tri_type tritype);

GGML_API struct ggml_tensor * ggml_tri_keep(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_tri_type tritype);

// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
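Taken together, the new entry points compose into small graphs directly from the public API. The following is a minimal usage sketch, not taken from the PR: it assumes a CPU-only context with tensor data allocated in-place (no_alloc = false), that ggml_graph_compute_with_ctx() is available via ggml-cpu.h, and that ggml_tri_keep() keeps the values inside the selected triangle while zeroing the rest; the exact masking semantics should be confirmed against the CPU kernels.

#include "ggml.h"
#include "ggml-cpu.h"   // assumed location of ggml_graph_compute_with_ctx()
#include <cstdio>

int main() {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,   // allocate tensor data inside this context
    };
    struct ggml_context * ctx = ggml_init(ip);

    // 4x4 matrix of ones (dim 0 is the contiguous dimension in ggml)
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    float * ad = (float *) a->data;
    for (int i = 0; i < 4*4; ++i) {
        ad[i] = 1.0f;
    }

    // cumulative sum along dim 0: every row becomes 1, 2, 3, 4
    struct ggml_tensor * cs = ggml_cumsum(ctx, a, /*dim =*/ 0);
    // keep the lower triangle (incl. diagonal); assumed to zero the rest
    struct ggml_tensor * lo = ggml_tri_keep(ctx, cs, GGML_TRI_TYPE_LOWER_DIAG);
    // elementwise softplus over the masked result
    struct ggml_tensor * sp = ggml_softplus(ctx, lo);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, sp);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    const float * out = (const float *) sp->data;
    for (int i1 = 0; i1 < 4; ++i1) {
        for (int i0 = 0; i0 < 4; ++i0) {
            std::printf("%8.4f ", out[i1*4 + i0]);
        }
        std::printf("\n");
    }

    ggml_free(ctx);
    return 0;
}

The SSD code path presumably combines the same building blocks (chunkwise cumulative sums, triangular decay masks, and softplus on the time-step parameter); the sketch above only illustrates the call shapes, not the actual model graph.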
11 changes: 11 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -1731,6 +1731,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_sum_rows(params, tensor);
} break;
case GGML_OP_CUMSUM:
{
ggml_compute_forward_cumsum(params, tensor);
} break;
case GGML_OP_MEAN:
{
ggml_compute_forward_mean(params, tensor);
@@ -1943,6 +1947,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_leaky_relu(params, tensor);
} break;
case GGML_OP_TRI:
{
ggml_compute_forward_tri(params, tensor);
} break;
case GGML_OP_FLASH_ATTN_EXT:
{
ggml_compute_forward_flash_attn_ext(params, tensor);
@@ -2153,6 +2161,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGMAX:
case GGML_OP_CUMSUM:
case GGML_OP_TRI:
{
n_tasks = 1;
} break;
@@ -2192,6 +2202,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_XIELU:
case GGML_UNARY_OP_SOFTPLUS:
{
n_tasks = n_threads;
} break;