ggml-org · max-krasnyansky · Mar 6, 2026 · Feb 26, 2026 · Mar 5, 2026 · Mar 4, 2026
@@ -287,7 +287,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
-| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
+| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
 | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
 
 ## Obtaining and quantizing models

@@ -1865,15 +1865,26 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
     const struct ggml_tensor * src1 = op->src[1];
     const struct ggml_tensor * dst  = op;
 
-    if (src0->type != GGML_TYPE_F32) {
-        return false;
+    if (src0->type == GGML_TYPE_F32) {
+        if (src1->type != GGML_TYPE_F32) {
+            return false;
+        }
+        if (dst->type != GGML_TYPE_F32) {
+            return false;
+        }
     }
-    if (src1->type != GGML_TYPE_F32) {
-        return false;
+    else if (src0->type == GGML_TYPE_F16) {
+        if (src1->type != GGML_TYPE_F16) {
+            return false;
+        }
+        if (dst->type != GGML_TYPE_F16) {
+            return false;
+        }
     }
-    if (dst->type != GGML_TYPE_F32) {
+    else {
         return false;
     }
+
     if (!ggml_are_same_shape(src0, dst)) {
         return false;
     }

@@ -693,8 +693,8 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
             return HTP_STATUS_NO_SUPPORT;
     }
 
-    const uint32_t n_threads  = octx->n_threads;
     const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
+    const uint32_t n_threads  = MIN(octx->n_threads, src0_nrows);
 
     size_t src0_row_size = src0->nb[1];
     size_t src1_row_size = src1->nb[1]; // zero bytes if src1 is not used
@@ -748,13 +748,11 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
         return HTP_STATUS_OK;
     }
 
-    uint32_t n_jobs = MIN(n_threads, src0_nrows);
-
     // Prepare context
     struct htp_act_context actx;
     actx.octx = octx;
 
-    actx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
+    actx.src0_nrows_per_thread = (src0_nrows + n_threads - 1) / n_threads;
 
     actx.src0_row_size = src0_row_size;
     actx.src1_row_size = src1_row_size;
@@ -794,7 +792,7 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) {
     actx.data_src1 = data_src1;
     actx.data_dst  = (uint8_t *) dst->data;
 
-    worker_pool_run_func(octx->ctx->worker_pool, act_op_func, &actx, n_jobs);
+    worker_pool_run_func(octx->ctx->worker_pool, act_op_func, &actx, n_threads);
     return HTP_STATUS_OK;
 }
 

@@ -241,6 +241,9 @@ int op_argsort(struct htp_ops_context * octx) {
         return HTP_STATUS_NO_SUPPORT;
     }
 
+    const uint32_t total_rows = octx->src0.ne[1] * octx->src0.ne[2] * octx->src0.ne[3];
+    const uint32_t n_threads = MIN(total_rows, octx->n_threads);
+
     // Allocate scratchpad
     // We need 1 row of float + 1 row of int32 per thread.
     uint32_t ne00 = octx->src0.ne[0];
@@ -251,7 +254,7 @@ int op_argsort(struct htp_ops_context * octx) {
     // Make sure we round up to 256 for alignment requirements
     spad_per_thread = hex_round_up(spad_per_thread, 256);
 
-    size_t total_spad_size = spad_per_thread * octx->n_threads;
+    size_t total_spad_size = spad_per_thread * n_threads;
 
     if (octx->ctx->vtcm_size < total_spad_size) {
         FARF(ERROR, "argsort: VTCM size too small. Needed %zu, have %zu", total_spad_size, octx->ctx->vtcm_size);
@@ -267,15 +270,12 @@ int op_argsort(struct htp_ops_context * octx) {
          octx->dst.ne[0], octx->dst.ne[1], octx->dst.ne[2], octx->dst.ne[3],
          octx->src0.data, octx->dst.data);
 
-    uint32_t total_rows = octx->src0.ne[1] * octx->src0.ne[2] * octx->src0.ne[3];
-    uint32_t n_jobs = MIN(total_rows, octx->n_threads);
-
     struct htp_argsort_context actx;
     actx.octx = octx;
-    actx.nrows_per_thread = (total_rows + n_jobs - 1) / n_jobs;
+    actx.nrows_per_thread = (total_rows + n_threads - 1) / n_threads;
 
     // Run jobs
-    worker_pool_run_func(octx->ctx->worker_pool, htp_argsort_f32, &actx, n_jobs);
+    worker_pool_run_func(octx->ctx->worker_pool, htp_argsort_f32, &actx, n_threads);
 
     return HTP_STATUS_OK;
 }
@@ -202,6 +202,8 @@ static void cpy_work_func(unsigned int n, unsigned int i, void *data) {
 int op_cpy(struct htp_ops_context * octx) {
     cpy_preamble;
 
+    const uint32_t n_threads = MIN(nr, octx->n_threads);
+
     struct htp_copy_context ct;
     ct.octx = octx;
 
@@ -227,8 +229,7 @@ int op_cpy(struct htp_ops_context * octx) {
     const bool transposed = (nb00 > nb01) || (nb0 > nb1);
     const bool sameshape  = !transposed && (ne00 == ne0 && ne01 == ne1 && ne02 == ne2 && ne03 == ne3);
 
-    const uint32_t n_jobs = MIN(nr, octx->n_threads);
-    ct.src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
+    ct.src0_nrows_per_thread = (nr + n_threads - 1) / n_threads;
 
     if (sametype && sameshape) {
         ct.copy = cpy_thread_sametype_sameshape;
@@ -245,7 +246,7 @@ int op_cpy(struct htp_ops_context * octx) {
         return HTP_STATUS_NO_SUPPORT;
     }
 
-    worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_jobs);
+    worker_pool_run_func(octx->ctx->worker_pool, cpy_work_func, &ct, n_threads);
 
     return HTP_STATUS_OK;
 }
@@ -82,6 +82,8 @@ static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *da
 int op_get_rows(struct htp_ops_context * octx) {
     get_rows_preamble;
 
+    const uint32_t n_threads = MIN(nr, octx->n_threads);
+
     if (octx->src0.type != HTP_TYPE_F32) {
         return HTP_STATUS_NO_SUPPORT;
     }
@@ -103,9 +105,8 @@ int op_get_rows(struct htp_ops_context * octx) {
     grctx.get_rows_div_ne10      = init_fastdiv_values(octx->src1.ne[0]);
     grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]);
 
-    const uint32_t n_jobs = MIN(nr, octx->n_threads);
-    grctx.src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs;
+    grctx.src1_nrows_per_thread = (nr + n_threads - 1) / n_threads;
 
-    worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_jobs);
+    worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_threads);
     return HTP_STATUS_OK;
 }