
Commit fe2baf5

SamuelOliveiradsF1LM1 authored and committed
Squashed commit of the following:
commit 912ed2cd9339d1b2875d98744ca5b51fa62e581e (samuel <[email protected]>, Sun Dec 7 23:00:29 2025 -0300)
    speculative (feat): implement recursive MTP drafting for GLM-4.5
commit bdf72d9552e3da64ffc85f175664713388752914 (samuel <[email protected]>, Sat Dec 6 16:10:16 2025 -0300)
    sampling (feat): optimize speculative drafting with fast-path selection
commit a91980a8f3475a6bbac0a64d8be06dd4b613020e (samuel <[email protected]>, Sat Dec 6 15:18:19 2025 -0300)
    mtp (chore): clean old code
commit 6de0ecf55db8567db4faa99b0152b72c9e854548 (samuel <[email protected]>, Sat Dec 6 14:40:13 2025 -0300)
    mtp (feat): add mtp arg
commit ea77394183b8e6c368af969b8274039a54b11486 (samuel <[email protected]>, Sat Dec 6 13:47:54 2025 -0300)
    mtp-graph (fix): move llama_get_logits_ith outside the loop
commit 15dff208958fb66802f20ec53ce5fcaff133edb7 (merge of 171346c74 and cae85fe53; samuel <[email protected]>, Thu Oct 16 13:44:41 2025 -0300)
    Merge branch 'glm4-mtp-batch' of https://github.com/SamuelOliveirads/llama.cpp into glm4-mtp-graph-cache
commit cae85fe531876762ee02524fc4c3f6c5e7824c63 (samuel <[email protected]>, Thu Oct 16 13:42:31 2025 -0300)
    mtp-batch (fix): avoid logits for mtp kv cache operations
commit 171346c742c310bbcfbd786b61250638ccf8b44d (samuel <[email protected]>, Sun Oct 12 16:33:01 2025 -0300)
    mtp-graph (feat): Reactivate graph reuse only for main model path
commit 0127c6beeb384ec3abbc18b22dbe830f22fcf4b4 (samuel <[email protected]>, Sat Oct 11 22:20:54 2025 -0300)
    mtp-batch (chore): Remove final MTP debug logs and dead code
commit 4bcc9e261ef57ee4cfaa65d06bcd0fcdeacf7797 (samuel <[email protected]>, Sat Oct 11 18:51:22 2025 -0300)
    mtp-batch (fix): Correctly advance cache head and add MTP documentation
commit b4cbe030ac25056717763b812d1dd89681c08522 (samuel <[email protected]>, Sat Oct 11 18:37:40 2025 -0300)
    mtp-batch (chore): Fix logit flags for speculative sampling and remove debug logs
commit a99709d0c1401d0b447dce1bd0101fb56390f50e (samuel <[email protected]>, Fri Oct 10 17:24:34 2025 -0300)
    mtp-batch (refactor): Extract decode context and MTP input logic into helper methods
commit 913af8f48d2dab1d9e907cf6c48c921a229a295c (samuel <[email protected]>, Fri Oct 10 16:44:28 2025 -0300)
    mtp-batch (refactor): Replace MTP boolean flags with an explicit operation enum
commit 6f74ba38070d62d37bc0fb71ce9871e1a4ffabcc (samuel <[email protected]>, Thu Oct 9 22:27:18 2025 -0300)
    mtp-batch (fix): prevent mtp draft from polluting the cache
commit 5e1d719beffccf8c22784c24b52ff6f5ab56b9ff (samuel <[email protected]>, Thu Oct 9 15:21:23 2025 -0300)
    mtp-batch (feat): Create and manage sinfo for MTP
commit febd8235d27fe9174ee4b54ea7a10e630939fee0 (samuel <[email protected]>, Sun Oct 5 14:43:40 2025 -0300)
    mtp-batch (wip): fix how to warmup kv cache for MTP
commit 67c6c069e0a5496adfd7d8aa6ca7514db5a6f437 (samuel <[email protected]>, Sat Sep 27 19:42:32 2025 -0300)
    mtp-batch (wip): Isolate MTP graph to prevent host embedding buffer corruption
commit 75dc25e6fe781c1b65038d69390fb778d760e3a1 (samuel <[email protected]>, Sat Sep 27 17:17:00 2025 -0300)
    mtp-batch (wip): organize batch for mtp cache
commit 3da7e7f3309dbb576538850c92c1cbf8fdc6d6ee (samuel <[email protected]>, Tue Sep 23 22:45:11 2025 -0300)
    mtp-batch (fix): warm mtp cache for small batch size
commit df64508b937784112168aa099644b60fef015f05 (samuel <[email protected]>, Sun Sep 21 21:55:41 2025 -0300)
    mtp-batch (wip): merge glm graphs
commit 042eb8a829876ed175320df9c8133bcea0c40460 (samuel <[email protected]>, Sun Sep 21 21:29:00 2025 -0300)
    mtp-batch (wip): merge mtp and model graph
commit 1318b2de82716710b9853e07bd640443a5a025bb (samuel <[email protected]>, Sun Sep 14 10:22:59 2025 -0300)
    mtp-batch (wip): move mtp execution to batch format
commit c6237c71ffd4485df1c35829c380b63e472fc5dd (merge of 9fab53e43 and 8742ce0e3; Aaron Lee <[email protected]>, Sat Sep 13 02:57:01 2025 -0400)
    Merge pull request #1 from SamuelOliveirads/glm4-moe-mtp (feat: implemented sampling for MTP)
commit 8742ce0e39823eeb101bb5b6099ff4ca7be10c6e (samuel <[email protected]>, Sat Sep 6 00:21:18 2025 -0300)
    feat: apply logits + greedy sampler
commit 5a5bce85777041d841393b4396e28f8e3065bb10 (samuel <[email protected]>, Wed Sep 3 17:56:14 2025 -0300)
    fix: add sample acceptance
commit 07670a22c63b1fa335d6ec1c4a1e4255a920848c (samuel <[email protected]>, Wed Sep 3 13:25:21 2025 -0300)
    feat: implemented sampling for MTP
commit 9fab53e4388c20aef497efd82e86dcb99ca58064 (Aaron Lee <[email protected]>, Tue Sep 2 17:14:09 2025 -0400)
    fixed mtp kv cache update step in cases where prompt size > n_batch and n_ubatch
commit 98bc0c6bf223f425f4ecea14f13fc46101f1b44a (Aaron Lee <[email protected]>, Tue Aug 26 01:26:51 2025 -0400)
    replace standard sampler with greedy sampler for mtp draft
commit 471e026327cca9f6f58aeefe32129a6cb9390f4f (Aaron Lee <[email protected]>, Tue Aug 19 23:10:56 2025 -0400)
    fixed vram leak
commit d72f9d5691054958cd1b139f228e5e588d3974cf (Aaron Lee <[email protected]>, Tue Aug 19 01:50:34 2025 -0400)
    kludge-y kv cache management of mtp layer
commit 382135aa3619294ab8bf87b0de4b1255ab7942f0 (Aaron Lee <[email protected]>, Sun Aug 17 21:54:45 2025 -0400)
    fixed mtp kv cache update sequencing after prompt processing
commit 6870f9790c1bb1d0254241267b1a6c8a7fc82830 (Aaron Lee <[email protected]>, Sun Aug 17 04:59:36 2025 -0400)
    added proper KV cache management for MTP layers and slightly refactored
commit 6e9bafc7a738b4c99f9440c0ec461e08cf6ce702 (Aaron Lee <[email protected]>, Fri Aug 15 23:13:56 2025 -0400)
    failed attempt to implement MTP; outputs tokens but KV cache management is unreasonable
commit cf0f7c0448c2c1736588673114558e5829db7879 (Aaron Lee <[email protected]>, Wed Aug 13 02:21:17 2025 -0400)
    broad thrust of the mtp implementation
commit 03231da69eec20677e25e2307d4fe31ac2ede034 (Aaron Lee <[email protected]>, Tue Aug 12 01:03:59 2025 -0400)
    add model member function to build mtp graph, to be called from speculative.cpp
commit 1f477b375504aa557ed21066aa6783b11781a179 (Aaron Lee <[email protected]>, Mon Aug 11 20:54:45 2025 -0400)
    make nextn weights loadable without a crash
commit e434f87cc739a1901931d88e33f777170a4e18e7 (Aaron Lee <[email protected]>, Mon Aug 11 01:21:47 2025 -0400)
    some work towards building mtp layer graph
commit db60623e7926fb151b3cc63f029929122cac342a (Aaron Lee <[email protected]>, Sun Aug 10 23:52:54 2025 -0400)
    added getter for nextn layer count and server slot has_mtp property
1 parent e1f15b4 commit fe2baf5

18 files changed: +1037 additions, -280 deletions

common/arg.cpp

Lines changed: 7 additions & 0 deletions
@@ -3214,6 +3214,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-mtp", "--multi-token-prediction"},
+        string_format("Activate multi-token-prediction (if supported) (default: %s)", params.mtp ? "true" : "false"),
+        [](common_params & params) {
+            params.mtp = true;
+        }
+    ));
     add_opt(common_arg(
         {"-ctvd", "--cache-type-v-draft"}, "TYPE",
         string_format(

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -430,6 +430,7 @@ struct common_params {
     bool no_op_offload   = false; // globally disable offload host tensor operations to device
     bool no_extra_bufts  = false; // disable extra buffer types (used for weight repacking)
     bool no_host         = false; // bypass host buffer allowing extra buffers to be used
+    bool mtp             = false; // use mtp if supported

     bool single_turn     = false; // single turn chat conversation

common/sampling.cpp

Lines changed: 39 additions & 0 deletions
@@ -666,3 +666,42 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri

     return samplers;
 }
+
+/**
+ * Specialized sampling for speculative drafting.
+ *
+ * Prioritizes performance by using a direct ArgMax loop (greedy) when no
+ * penalties (repetition, frequency, presence, DRY) are configured.
+ * Falls back to the full sampler chain if penalties are active, to prevent
+ * generative loops and adhere to constraints.
+ */
+llama_token common_sampler_sample_speculative(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
+    const auto & params = gsmpl->params;
+
+    bool use_heavy_sampler =
+        (params.penalty_last_n > 0 && (
+            params.penalty_repeat  != 1.0f ||
+            params.penalty_freq    != 0.0f ||
+            params.penalty_present != 0.0f
+        )) ||
+        (params.dry_allowed_length > 0 && params.dry_multiplier != 0.0f);
+
+    if (use_heavy_sampler) {
+        return common_sampler_sample(gsmpl, ctx, idx, false);
+    }
+
+    float * logits = llama_get_logits_ith(ctx, idx);
+    const int n_vocab = llama_n_vocab(llama_model_get_vocab(llama_get_model(ctx)));
+
+    int best_id = 0;
+    float max_val = logits[0];
+
+    for (int i = 1; i < n_vocab; ++i) {
+        if (logits[i] > max_val) {
+            max_val = logits[i];
+            best_id = i;
+        }
+    }
+
+    return best_id;
+}

common/speculative.cpp

Lines changed: 113 additions & 0 deletions
@@ -359,3 +359,116 @@ llama_tokens common_speculative_gen_draft(
     }
     return result;
 }
+
+llama_tokens mtp_speculative_gen_draft(
+        struct common_sampler * smpl,
+        struct llama_context * ctx,
+        struct common_speculative_params params,
+        llama_token id_last,
+        int32_t n_past,
+        llama_seq_id seq_id) {
+
+    int n_draft = params.n_draft;
+
+    llama_tokens drafts;
+    drafts.reserve(n_draft);
+
+    if (!smpl) return drafts;
+
+    llama_batch mtp_batch = llama_batch_init(1, 0, 1);
+    mtp_batch.mtp_params.op_type = MTP_OP_DRAFT_GEN;
+
+    llama_token current_input_id = id_last;
+    int32_t current_n_past = n_past;
+
+    for (int i = 0; i < n_draft; ++i) {
+        mtp_batch.n_tokens = 0;
+        common_batch_add(mtp_batch, current_input_id, current_n_past, {seq_id}, true);
+
+        // Perform the MTP draft generation decode. This writes the MTP layer's
+        // KV state for the draft token into the cache.
+        if (llama_decode(ctx, mtp_batch) != 0) {
+            break;
+        }
+
+        llama_token id_next = common_sampler_sample_speculative(smpl, ctx, 0);
+
+        // Drafting stops if token probability drops below `p_min` to save compute.
+        const auto * cur_p = common_sampler_get_candidates(smpl, true);
+        if (cur_p && cur_p->size > 0) {
+            float prob = cur_p->data[0].p;
+
+            if (prob < params.p_min) {
+                drafts.push_back(id_next);
+                current_n_past++;
+                break;
+            }
+        }
+
+        drafts.push_back(id_next);
+
+        current_input_id = id_next;
+        current_n_past++;
+    }
+    llama_batch_free(mtp_batch);
+
+    // CRITICAL: Purge the metadata for the draft tokens we just wrote.
+    // This makes the physical cells available again for the main model's validation pass,
+    // preventing a cache state corruption where two cells map to the same logical position.
+    if (!drafts.empty()) {
+        llama_kv_cache_seq_rm(ctx, seq_id, n_past, current_n_past);
+    }
+
+    return drafts;
+}
+
+void mtp_update_kv_cache(struct llama_context * ctx, const llama_batch & batch, bool is_prompt_warmup) {
+    if (batch.n_tokens == 0) {
+        return;
+    }
+
+    LOG_DBG("[MTP-UPDATE|%s] Updating %d tokens...\n", is_prompt_warmup ? "PROMPT_WARMUP" : "GEN_ACCEPTED", batch.n_tokens);
+
+    llama_batch mtp_batch = batch;
+    if (is_prompt_warmup) {
+        mtp_batch.mtp_params.op_type = MTP_OP_WARMUP;
+    } else {
+        mtp_batch.mtp_params.op_type = MTP_OP_UPDATE_ACCEPTED;
+    }
+
+    for (int i = 0; i < mtp_batch.n_tokens; ++i) {
+        mtp_batch.logits[i] = true;
+    }
+    llama_decode(ctx, mtp_batch);
+}
+
+void mtp_accept_tokens(
+        struct llama_context * ctx,
+        const std::vector<llama_token> & ids,
+        int32_t n_past_base,
+        llama_seq_id seq_id
+) {
+    if (ids.empty()) {
+        return;
+    }
+
+    // Prepare a resized copy of the validation sinfo to match the number of accepted tokens.
+    // This sets up the context for a "forced sinfo" decode.
+    if (!llama_mtp_prepare_sinfo_for_update(ctx, ids.size())) {
+        return;
+    }
+
+    // Build a new batch containing only the accepted tokens.
+    llama_batch accepted_batch = llama_batch_init(ids.size(), 0, 1);
+    for (size_t i = 0; i < ids.size(); ++i) {
+        common_batch_add(accepted_batch, ids[i], n_past_base + i, { seq_id }, true);
+    }
+
+    mtp_update_kv_cache(ctx, accepted_batch, false);
+
+    // Clean up the forced state so it does not affect subsequent, normal decode calls.
+    llama_mtp_cancel_sinfo_update(ctx);
+
+    llama_batch_free(accepted_batch);
+}

common/speculative.h

Lines changed: 43 additions & 4 deletions
@@ -12,6 +12,12 @@ struct common_speculative_params {
     float p_min = 0.75f; // min probability required to accept a token in the draft
 };

+struct mtp_kv_update_data {
+    llama_token id;
+    int32_t     n_past;
+    int32_t     tok_idx;
+};
+
 struct common_speculative * common_speculative_init(
         struct llama_context * ctx_tgt,
         struct llama_context * ctx_dft
@@ -29,7 +35,40 @@ void common_speculative_add_replacement_tgt_dft(

 // sample up to n_draft tokens and add them to the batch using the draft model
 llama_tokens common_speculative_gen_draft(
-    struct common_speculative * spec,
-    struct common_speculative_params params,
-    const llama_tokens & prompt,
-    llama_token id_last);
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt,
+        llama_token id_last);
+
+/**
+ * @brief Generates speculative draft tokens using the Multi-Token Prediction (MTP) architecture.
+ *
+ * This function performs a recursive generation loop using the MTP head (e.g., Eagle/NextN).
+ * It uses the fixed hidden state from the main model's last step and updates the MTP layer's
+ * internal KV cache autoregressively.
+ *
+ * @param smpl    The sampler instance.
+ * @param ctx     The llama context (shared between the main model and MTP).
+ * @param params  Speculative parameters (n_draft, p_min).
+ * @param id_last The last confirmed token ID from the main model.
+ * @param n_past  The number of tokens in the validated past (start position for drafting).
+ * @param seq_id  The sequence ID to use for drafting.
+ *
+ * @return std::vector<llama_token> The generated draft tokens.
+ */
+llama_tokens mtp_speculative_gen_draft(
+        struct common_sampler * smpl,
+        struct llama_context * ctx,
+        struct common_speculative_params params,
+        llama_token id_last,
+        int32_t n_past,
+        llama_seq_id seq_id);
+
+void mtp_update_kv_cache(struct llama_context * ctx, const llama_batch & batch, bool is_prompt_warmup);
+
+void mtp_accept_tokens(
+        struct llama_context * ctx,
+        const std::vector<llama_token> & ids,
+        int32_t n_past_base,
+        llama_seq_id seq_id);

include/llama.h

Lines changed: 46 additions & 0 deletions
@@ -228,6 +228,17 @@ extern "C" {
     // - if not: only the last token is output
     // )
     //
+    typedef enum {
+        MTP_OP_NONE,
+        MTP_OP_WARMUP,
+        MTP_OP_UPDATE_ACCEPTED,
+        MTP_OP_DRAFT_GEN,
+    } llama_mtp_op_type;
+
+    typedef struct llama_mtp_params {
+        llama_mtp_op_type op_type;
+    } llama_mtp_params;
+
     typedef struct llama_batch {
         int32_t n_tokens;

@@ -237,6 +248,7 @@ extern "C" {
         int32_t      * n_seq_id;
         llama_seq_id ** seq_id;
         int8_t       * logits; // TODO: rename this to "output"
+        llama_mtp_params mtp_params;
     } llama_batch;

     enum llama_model_kv_override_type {
@@ -536,6 +548,8 @@ extern "C" {

     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);

+    LLAMA_API int32_t llama_model_n_nextn_layer(const struct llama_model * model);
+
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
@@ -1442,6 +1456,38 @@ extern "C" {
             ggml_opt_epoch_callback callback_train,
             ggml_opt_epoch_callback callback_eval);

+    //
+    // MTP
+    //
+
+    LLAMA_API void llama_set_draft_input_hidden_state(struct llama_context * ctx, const float * hidden_state);
+
+    /**
+     * @brief Prepares the context for an MTP KV cache update by creating a resized copy of the last sinfo.
+     * This is used after speculative validation when only a subset of draft tokens are accepted.
+     * @param n_accepted The number of tokens that were accepted and for which the sinfo should be resized.
+     * @return true on success.
+     */
+    LLAMA_API bool llama_mtp_prepare_sinfo_for_update(struct llama_context * ctx, size_t n_accepted);
+
+    /**
+     * @brief Prepares the context for an MTP KV cache update by reusing the sinfo from the last main model decode.
+     * This is used for the prompt warmup to ensure the MTP and main model KV caches are perfectly aligned.
+     * @return true on success.
+     */
+    LLAMA_API bool llama_mtp_prepare_sinfo_for_warmup(struct llama_context * ctx);
+
+    /**
+     * @brief Clears the forced sinfo state from the context. Must be called after a decode that used a prepared sinfo.
+     */
+    LLAMA_API void llama_mtp_cancel_sinfo_update(struct llama_context * ctx);
+
+    /**
+     * @brief Removes KV cache metadata for a specified sequence and token range.
+     * This makes the physical cells logically available again without deleting the tensor data.
+     */
+    LLAMA_API void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1);
+
 #ifdef __cplusplus
 }
 #endif

src/llama-arch.cpp

Lines changed: 7 additions & 6 deletions
@@ -2370,12 +2370,13 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_VISEXP_FFN_UP,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // NextN/MTP tensors are currently ignored (reserved for future MTP support)
     // These tensors only exist in the last layer(s) and are treated as output tensors
-    {LLM_TENSOR_NEXTN_EH_PROJ,          {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_NEXTN_EMBED_TOKENS,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_NEXTN_ENORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
-    {LLM_TENSOR_NEXTN_HNORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
-    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+    // Changed to LLM_TENSOR_LAYER_REPEATING because we saved these under a blk with a non-negative id
+    {LLM_TENSOR_NEXTN_EH_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_EMBED_TOKENS,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_ENORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_NEXTN_HNORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 };

 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

src/llama-batch.cpp

Lines changed: 18 additions & 17 deletions
@@ -301,17 +301,17 @@ bool llama_batch_allocr::init(
             ok = false;
         }

-        if (!ok) {
-            LLAMA_LOG_ERROR(
-                "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
-                " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
-                " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
-                " it is required that the sequence positions remain consecutive: Y = X + 1\n",
-                __func__, s, s, p0, s, seq_pos_min(s));
+        // if (!ok) {
+        //     LLAMA_LOG_ERROR(
+        //         "%s: the tokens of sequence %d in the input batch have inconsistent sequence positions:\n"
+        //         " - the last position stored in the memory module of the context (i.e. the KV cache) for sequence %d is X = %d\n"
+        //         " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n"
+        //         " it is required that the sequence positions remain consecutive: Y = X + 1\n",
+        //         __func__, s, s, p0, s, seq_pos_min(s));

-            return false;
-        }
-    }
+        //     return false;
+        // }
+        }

         if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
             LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
@@ -874,13 +874,14 @@ struct llama_batch llama_batch_get_one(

 struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
     llama_batch batch = {
-        /*n_tokens   =*/ 0,
-        /*tokens     =*/ nullptr,
-        /*embd       =*/ nullptr,
-        /*pos        =*/ nullptr,
-        /*n_seq_id   =*/ nullptr,
-        /*seq_id     =*/ nullptr,
-        /*logits     =*/ nullptr,
+        /*n_tokens   =*/ 0,
+        /*tokens     =*/ nullptr,
+        /*embd       =*/ nullptr,
+        /*pos        =*/ nullptr,
+        /*n_seq_id   =*/ nullptr,
+        /*seq_id     =*/ nullptr,
+        /*logits     =*/ nullptr,
+        /*mtp_params =*/ { MTP_OP_NONE },
     };

     if (embd) {