From 4a9012b43c129fe19161a5299a35997fcab9e0c6 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 8 Jan 2026 09:23:28 +0100 Subject: [PATCH 1/2] debug : include LLAMA_POOLING_TYPE_UNSPECIFIED in pooling check This commit updates the pooling check in the debug example to also include LLAMA_POOLING_TYPE_UNSPECIFIED and not just LLAMA_POOLING_TYPE_NONE. --- examples/debug/debug.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/debug/debug.cpp b/examples/debug/debug.cpp index 9bc5d0abfd2..37618443458 100644 --- a/examples/debug/debug.cpp +++ b/examples/debug/debug.cpp @@ -57,6 +57,16 @@ struct callback_data { } }; +static bool has_pooling(llama_context * ctx) { + switch (llama_pooling_type(ctx)) { + case LLAMA_POOLING_TYPE_NONE: + case LLAMA_POOLING_TYPE_UNSPECIFIED: + return false; + default: + return true; + } +} + struct output_data { float * data_ptr = nullptr; int data_size = 0; @@ -74,7 +84,7 @@ struct output_data { if (params.embedding) { const int n_embd = llama_model_n_embd_out(model); - const bool pooling_enabled = llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE; + const bool pooling_enabled = has_pooling(ctx); const int n_embd_count = pooling_enabled ? 1 : tokens.size(); const int n_embeddings = n_embd * n_embd_count; From f31c38dd19d012750146945ed930637d6a1050f2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 8 Jan 2026 11:05:16 +0100 Subject: [PATCH 2/2] debug : normalize both pooled and token embeddings This commit updates debug.cpp to normalize embeddings for both pooled and non-pooled outputs. For pooled embeddings, normalization is applied to the single vector, and for non-pooled embeddings, normalization is applied to each token embedding vector individually. The motivation for this is to enable non-pooled embeddings to be normalized which was not possible previously. --- examples/debug/debug.cpp | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/examples/debug/debug.cpp b/examples/debug/debug.cpp index 37618443458..63be40c8425 100644 --- a/examples/debug/debug.cpp +++ b/examples/debug/debug.cpp @@ -71,7 +71,7 @@ struct output_data { float * data_ptr = nullptr; int data_size = 0; std::string type_suffix; - std::vector storage; + std::vector embd_norm; std::string prompt; std::vector tokens; @@ -83,24 +83,32 @@ struct output_data { prompt = params.prompt; if (params.embedding) { - const int n_embd = llama_model_n_embd_out(model); - const bool pooling_enabled = has_pooling(ctx); - const int n_embd_count = pooling_enabled ? 1 : tokens.size(); - const int n_embeddings = n_embd * n_embd_count; - - float * embeddings; - if (pooling_enabled) { - embeddings = llama_get_embeddings_seq(ctx, 0); - storage.resize(n_embeddings); - common_embd_normalize(embeddings, storage.data(), n_embeddings, params.embd_normalize); - embeddings = storage.data(); - } else { - embeddings = llama_get_embeddings(ctx); + const int n_embd = llama_model_n_embd_out(model); + const bool pooling = has_pooling(ctx); + const int n_embd_count = pooling ? 1 : tokens.size(); + const int n_floats = n_embd * n_embd_count; + + float * embd_raw = pooling ? llama_get_embeddings_seq(ctx, 0) : llama_get_embeddings(ctx); + if (embd_raw == nullptr) { + throw std::runtime_error("failed to get embeddings from the model"); } - data_ptr = embeddings; - data_size = n_embeddings; + LOG_DBG("pooling_enabled: %s\n", pooling ? "true" : "false"); + LOG_DBG("n_embd: %d\n", n_embd); + LOG_DBG("n_floats: %d\n", n_floats); + LOG_DBG("n_embd_count: %d\n", n_embd_count); + + data_ptr = embd_raw; + data_size = n_floats; type_suffix = "-embeddings"; + + if (params.embd_normalize >= 0) { + embd_norm.resize(n_floats); + for (int i = 0; i < n_embd_count; i++) { + common_embd_normalize(embd_raw+i*n_embd, embd_norm.data()+i*n_embd, n_embd, params.embd_normalize); + } + data_ptr = embd_norm.data(); + } } else { const float * logits = llama_get_logits_ith(ctx, tokens.size() - 1); const int n_logits = llama_vocab_n_tokens(vocab);