@@ -2995,7 +2995,7 @@ void llama_opt_epoch(
29952995 callback_eval);
29962996}
29972997
2998- void llama_build_and_execute_mtp_graph (struct llama_context * ctx,
2998+ llama_token llama_build_and_execute_mtp_graph (struct llama_context * ctx,
29992999 const llama_batch batch_inp, llama_token last_token_id, int32_t n_past, int32_t last_tok_idx) {
30003000
30013001 const auto * model = llama_get_model (ctx);
@@ -3044,13 +3044,29 @@ void llama_build_and_execute_mtp_graph(struct llama_context * ctx,
30443044
30453045 ggml_backend_sched_graph_compute (sched, gf); // execute the graph
30463046
3047- struct ggml_tensor * logits_mtp = res_mtp->get_logits ();;
3047+ // struct ggml_tensor * logits_mtp = res_mtp->get_logits();
3048+
30483049 // LLAMA_LOG_INFO("logits_mtp pointer address: %p\n", (void*)logits_mtp);
30493050
3050- if (logits_mtp) {
3051- ctx->set_logits_ith (logits_mtp, sched, last_tok_idx);
3052- }
3051+ // if (logits_mtp) {
3052+ // ctx->set_logits_ith(logits_mtp, sched, last_tok_idx);
3053+ // }
3054+ struct ggml_tensor * token_id_tensor = ggml_get_tensor (res_mtp->get_ctx (), " mtp_argmax_result" );
3055+
3056+
3057+ llama_token token_id = 0 ; // The C++ variable to hold the result.
3058+
3059+ // ggml_backend_tensor_get is the function for GPU->CPU copies.
3060+ // We are copying a single 32-bit integer.
3061+ ggml_backend_tensor_get (
3062+ token_id_tensor,
3063+ &token_id, // Pointer to our C++ variable
3064+ 0 , // Starting offset in bytes
3065+ sizeof (llama_token) // Number of bytes to copy
3066+ );
30533067
30543068 ggml_backend_sched_free (sched);
3069+
3070+ return token_id;
30553071}
30563072
0 commit comments