@@ -1631,12 +1631,11 @@ void server_context::send_final_response(server_slot& slot) {
16311631 res->timings = slot.get_timings ();
16321632 res->post_sampling_probs = slot.params .post_sampling_probs ;
16331633 res->oaicompat = slot.params .oaicompat ;
1634- res->oaicompat_model = slot.params .oaicompat_model ;
16351634 res->oaicompat_cmpl_id = slot.params .oaicompat_cmpl_id ;
16361635 res->oaicompat_msg = slot.update_chat_msg (res->oaicompat_msg_diffs );
16371636 res->n_decoded = slot.n_decoded ;
16381637 res->n_prompt_tokens = slot.n_prompt_tokens ;
1639- res->oaicompat_model = slot.oaicompat_model ;
1638+ res->oaicompat_model = slot.task -> params . oaicompat_model ;
16401639 res->data = json{
16411640 {" content" , !slot.params .stream ? slot.generated_text : " " },
16421641 {" generated_text" , slot.generated_text }, // Always include full text for finish_reason logic
@@ -2590,9 +2589,9 @@ void server_context::batch_pending_prompt(const int32_t n_ubatch, const int32_t
25902589
25912590 slot.state = SLOT_STATE_PROCESSING;
25922591 slot.command = SLOT_COMMAND_NONE;
2592+ send_final_response (slot);
25932593 slot.release ();
25942594 slot.print_timings ();
2595- send_final_response (slot);
25962595 continue ;
25972596 }
25982597
@@ -2933,9 +2932,9 @@ void server_context::speculative_decoding_accept() {
29332932
29342933 if (!process_token (result, slot)) {
29352934 // release slot because of stop condition
2935+ send_final_response (slot);
29362936 slot.release ();
29372937 slot.print_timings ();
2938- send_final_response (slot);
29392938 metrics.on_prediction (slot);
29402939 break ;
29412940 }
@@ -2953,7 +2952,7 @@ void server_context::speculative_decoding_accept() {
29532952
29542953bool server_context::accept_special_token (const server_slot& slot, const llama_token token) {
29552954 return params_base.special || slot.sparams .preserved_tokens .find (token) != slot.sparams .preserved_tokens .end ();
2956- };
2955+ }
29572956
29582957
29592958void server_context::send_token_results (completion_token_outputs& results, server_slot& slot, int32_t n) {
@@ -2962,9 +2961,9 @@ void server_context::send_token_results(completion_token_outputs& results, serve
29622961 bool has_next = process_token (it, slot);
29632962 count++;
29642963 if (!has_next) {
2964+ send_final_response (slot);
29652965 slot.release ();
29662966 slot.print_timings ();
2967- send_final_response (slot);
29682967 metrics.on_prediction (slot);
29692968 break ;
29702969 }
0 commit comments