@@ -1155,6 +1155,14 @@ struct llama_server_context
             slot.has_next_token = false;
         }
 
+        if (slot.n_past >= slot.n_ctx) {
+            slot.truncated = true;
+            slot.stopped_limit = true;
+            slot.has_next_token = false;
+
+            LOG_VERBOSE("stopped due to running out of context capacity", {});
+        }
+
         if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
         {
             slot.stopped_eos = true;
@@ -1627,17 +1635,17 @@ struct llama_server_context
         {
             if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
             {
+                // this check is redundant (for good measure)
+                // we should never get here, because generation should have already stopped in process_token()
+
                 // START LOCALAI changes
                 // Temporarily disable context shifting, as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
                 // See: https://github.com/mudler/LocalAI/issues/1333
                 // Context is exhausted, release the slot
                 slot.release();
                 send_final_response(slot);
-                slot.cache_tokens.clear();
-                slot.n_past = 0;
-                slot.truncated = false;
-                slot.has_next_token = true;
-                LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                slot.has_next_token = false;
+                LOG_ERROR("context is exhausted, release the slot", {});
 
                 continue;
                 // END LOCALAI changes
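
Taken together, the two hunks add a two-layer guard against running past the context window: the first hunk stops generation in process_token() as soon as n_past reaches n_ctx, and the second hunk keeps a redundant check in update_slots() that now releases the slot instead of shifting the context. The sketch below restates that control flow in isolation; it is a minimal illustration, not the server code itself, and the slot_state struct and free functions are hypothetical stand-ins for llama.cpp's slot type and llama_server_context methods.

    #include <cstddef>

    // Hypothetical, simplified stand-in for the server's slot type.
    struct slot_state {
        int  n_past         = 0;
        int  n_ctx          = 0;
        bool truncated      = false;
        bool stopped_limit  = false;
        bool has_next_token = true;
        bool processing     = true;
    };

    // Primary guard, mirroring the first hunk (process_token()):
    // once n_past reaches n_ctx, stop generating instead of shifting the context.
    void check_context_capacity(slot_state & slot) {
        if (slot.n_past >= slot.n_ctx) {
            slot.truncated      = true;
            slot.stopped_limit  = true;
            slot.has_next_token = false;
        }
    }

    // Secondary guard, mirroring the second hunk (update_slots()):
    // with the primary guard in place this should never fire; if it does,
    // the caller releases the slot and sends the final response instead of
    // attempting a context shift.
    bool context_exhausted(const slot_state & slot, size_t system_tokens, size_t cache_tokens) {
        return slot.processing && system_tokens + cache_tokens >= (size_t) slot.n_ctx;
    }

The key behavioral change in the patch is that context exhaustion now ends the request (has_next_token = false, slot released, final response sent) rather than resetting the slot and rolling the cache, which is what previously allowed the infinite loops described in the linked issues.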