From c62a495a02f2439ceef524c2e980190af95d23f3 Mon Sep 17 00:00:00 2001 From: Scott Sweeney <1149151+ssweens@users.noreply.github.com> Date: Fri, 12 Dec 2025 00:39:33 -0800 Subject: [PATCH 1/4] kv-cache : fix state restore with fragmented cache (#17527) Change find_slot to allow non-contiguous allocation during state restore. Fixes 'failed to find available cells in kv cache' error when restoring state to fragmented cache. --- src/llama-kv-cache.cpp | 91 ++++++++++---- src/llama-kv-cache.h | 18 ++- tests/CMakeLists.txt | 8 ++ tests/test-state-restore-fragmented.cpp | 156 ++++++++++++++++++++++++ 4 files changed, 244 insertions(+), 29 deletions(-) create mode 100644 tests/test-state-restore-fragmented.cpp diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 3e02bd62977..0d3a762fe01 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1549,9 +1549,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id]; + slot_info sinfo; + bool res = true; - res = res && state_read_meta(io, strm, cell_count, seq_id); - res = res && state_read_data(io, strm, cell_count); + res = res && state_read_meta(io, strm, cell_count, seq_id, sinfo); + res = res && state_read_data(io, strm, cell_count, sinfo); if (!res) { if (seq_id == -1) { @@ -1690,7 +1692,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t } } -bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) { +bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id, slot_info & sinfo) { auto & cells = v_cells[strm]; auto & head = v_heads[strm]; @@ -1727,7 +1729,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 ubatch.seq_id[i] = &dest_seq_id; } - const auto sinfo = find_slot(ubatch, true); + sinfo = find_slot(ubatch, false); if (sinfo.empty()) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; @@ -1737,20 +1739,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350 apply_ubatch(sinfo, ubatch); - const auto head_cur = sinfo.head(); - - // keep the head at the old position because we will read the KV data into it in state_read_data() - head = head_cur; - - LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id); + LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id); - // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(head_cur + cell_count <= cells.size()); - GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]); - GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]); - GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id)); - GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id)); + // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values + GGML_ASSERT(sinfo.n_stream() == 1); + GGML_ASSERT(sinfo.idxs[0].size() == cell_count); + for (uint32_t i = 0; i < cell_count; ++i) { + const uint32_t idx = sinfo.idxs[0][i]; + 
GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]); + GGML_ASSERT(cells.seq_has(idx, dest_seq_id)); + } } else { // whole KV cache restore @@ -1783,15 +1781,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 } } + // Create contiguous slot_info for whole cache restore + sinfo.s0 = strm; + sinfo.s1 = strm; + sinfo.resize(1); + sinfo.strm[0] = strm; + sinfo.idxs[0].resize(cell_count); + for (uint32_t i = 0; i < cell_count; ++i) { + sinfo.idxs[0][i] = i; + } + head = 0; } return true; } -bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) { +bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) { auto & cells = v_cells[strm]; - auto & head = v_heads[strm]; uint32_t v_trans; uint32_t n_layer; @@ -1841,8 +1848,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32 } if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + if (sinfo.is_contiguous()) { + // Fast path: contiguous cells, single memcpy + ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row); + } else { + // Slow path: scatter to non-contiguous positions + const void * src = io.read(cell_count * k_size_row); + for (uint32_t i = 0; i < cell_count; ++i) { + const size_t dst_offset = sinfo.idxs[0][i] * k_size_row; + ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row); + } + } } } @@ -1873,8 +1889,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32 } if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + if (sinfo.is_contiguous()) { + // Fast path: contiguous cells, single memcpy + ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row); + } else { + // Slow path: scatter to non-contiguous positions + const void * src = io.read(cell_count * v_size_row); + for (uint32_t i = 0; i < cell_count; ++i) { + const size_t dst_offset = sinfo.idxs[0][i] * v_size_row; + ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row); + } + } } } } else { @@ -1913,10 +1938,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32 } if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (head + j * cells.size()) * v_size_el; - ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + if (sinfo.is_contiguous()) { + // Fast path: contiguous cells + const uint32_t h = sinfo.head(); + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (h + j * cells.size()) * v_size_el; + ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } else { + // Slow path: scatter to non-contiguous positions + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const void * src = io.read(cell_count * v_size_el); + for (uint32_t i = 0; i < cell_count; ++i) { + const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el; + ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, 
dst_offset, v_size_el); + } + } } } } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index bf7821c07ca..977159530fd 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -72,6 +72,20 @@ class llama_kv_cache : public llama_memory_i { void clear() { idxs.clear(); } + + // check if indices are contiguous starting from head() + bool is_contiguous() const { + if (idxs.empty() || idxs[0].empty()) { + return true; + } + const uint32_t h = idxs[0][0]; + for (size_t i = 0; i < idxs[0].size(); ++i) { + if (idxs[0][i] != h + i) { + return false; + } + } + return true; + } }; using slot_info_vec_t = std::vector; @@ -264,8 +278,8 @@ class llama_kv_cache : public llama_memory_i { void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const; void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const; - bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count); + bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id, slot_info & sinfo); + bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo); }; class llama_kv_cache_context : public llama_memory_context_i { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9ba559c8dfb..ed97b1363f1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -222,6 +222,14 @@ llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") +# Test for state restore with fragmented KV cache +# Requires a model, uses same args pattern as test-thread-safety +if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") + llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -c 256 -np 3) +else() + llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -c 256 -np 3) +endif() + if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading llama_build_and_test(test-barrier.cpp) diff --git a/tests/test-state-restore-fragmented.cpp b/tests/test-state-restore-fragmented.cpp new file mode 100644 index 00000000000..b7e0d8d2612 --- /dev/null +++ b/tests/test-state-restore-fragmented.cpp @@ -0,0 +1,156 @@ +// Test for state restore with fragmented KV cache +// This tests the fix for: https://github.com/ggml-org/llama.cpp/pull/XXXX +// The issue was that state restore required contiguous KV cache slots, +// which fails when the cache is fragmented. +// +// The fix changes find_slot(ubatch, true) to find_slot(ubatch, false) +// in state_read_meta(), allowing non-contiguous slot allocation. 
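+//
+// Illustrative example (values made up, not checked by this test): a contiguous
+// allocation might yield slot indices sinfo.idxs[0] = {4, 5, 6, 7}, while on a
+// fragmented cache find_slot(ubatch, false) can return scattered indices such
+// as {1, 4, 7, 10}; state_read_data() then writes the saved rows cell by cell
+// instead of with a single contiguous copy.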
+
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.prompt = "The quick brown fox";
+    params.sampling.seed = 1234;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    // Need multiple sequences to create fragmentation
+    if (params.n_parallel < 3) {
+        params.n_parallel = 3;
+    }
+
+    common_init();
+
+    if (params.n_predict < 0) {
+        params.n_predict = 8;
+    }
+
+    // init
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    GGML_UNUSED(model);
+
+    // tokenize prompt
+    auto tokens = common_tokenize(ctx, params.prompt, true);
+
+    // Step 1: Process tokens on seq 0
+    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {0}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode seq 0\n", __func__);
+        return 1;
+    }
+
+    // Step 2: Process tokens on seq 1 (to create fragmentation later)
+    common_batch_clear(batch);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {1}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode seq 1\n", __func__);
+        return 1;
+    }
+
+    // Step 3: Process tokens on seq 2 (to create more fragmentation)
+    common_batch_clear(batch);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {2}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode seq 2\n", __func__);
+        return 1;
+    }
+
+    fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size());
+
+    // Step 4: Save state of seq 0
+    std::vector<uint8_t> seq_state(llama_state_seq_get_size(ctx, 0));
+    const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 0);
+    if (ncopy != seq_state.size()) {
+        fprintf(stderr, "%s : failed to save seq 0 state\n", __func__);
+        return 1;
+    }
+    fprintf(stderr, "%s : saved seq 0 state, %zu bytes\n", __func__, ncopy);
+
+    // Step 5: Clear seq 1 to create a "hole" in the KV cache (fragmentation)
+    llama_memory_t mem = llama_get_memory(ctx);
+    llama_memory_seq_rm(mem, 1, -1, -1);
+    fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__);
+
+    // Step 6: Clear seq 0 as well
+    llama_memory_seq_rm(mem, 0, -1, -1);
+    fprintf(stderr, "%s : cleared seq 0\n", __func__);
+
+    // Now the cache has:
+    // - A hole where seq 0 was (at the beginning)
+    // - A hole where seq 1 was (in the middle)
+    // - seq 2 data (at the end)
+    // This creates fragmentation - there's no contiguous block large enough
+    // for the seq 0 state if we only look for contiguous slots
+
+    // Step 7: Restore seq 0 state into seq 1 (should work with non-contiguous allocation)
+    // We use seq 1 since it's a valid sequence ID (0 to n_parallel-1)
+    // Before the fix, this would fail with "failed to find available cells in kv cache"
+    const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1);
+    if (nset != seq_state.size()) {
+        fprintf(stderr, "%s : FAILED to restore 
seq state into fragmented cache (got %zu, expected %zu)\n", + __func__, nset, seq_state.size()); + fprintf(stderr, "%s : This is the bug - state restore fails with fragmented KV cache\n", __func__); + llama_batch_free(batch); + return 1; + } + fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset); + + // Step 8: Verify we can decode with the restored state + // Generate one token to verify the restored state is usable + auto sparams = llama_sampler_chain_default_params(); + llama_sampler * smpl = llama_sampler_chain_init(sparams); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); + + auto next_token = llama_sampler_sample(smpl, ctx, -1); + auto next_token_str = common_token_to_piece(ctx, next_token); + + common_batch_clear(batch); + common_batch_add(batch, next_token, (int)tokens.size(), {1}, true); + + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to decode with restored state\n", __func__); + llama_sampler_free(smpl); + llama_batch_free(batch); + return 1; + } + + fprintf(stderr, "%s : successfully decoded with restored state, generated: '%s'\n", __func__, next_token_str.c_str()); + fprintf(stderr, "%s : SUCCESS - state restore works with fragmented KV cache\n", __func__); + + llama_sampler_free(smpl); + llama_batch_free(batch); + + return 0; +} \ No newline at end of file From f51af5740b0a3b446cc52398126d11807ee5f028 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Dec 2025 12:43:24 +0200 Subject: [PATCH 2/4] tests : update logic --- tests/CMakeLists.txt | 4 +- tests/test-state-restore-fragmented.cpp | 86 ++++++++----------------- 2 files changed, 28 insertions(+), 62 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ed97b1363f1..c3d9f9c324f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -225,9 +225,9 @@ llama_build_and_test(test-autorelease.cpp LABEL "model") # Test for state restore with fragmented KV cache # Requires a model, uses same args pattern as test-thread-safety if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") - llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -c 256 -np 3) + llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf) else() - llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -c 256 -np 3) + llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf) endif() if (NOT GGML_BACKEND_DL) diff --git a/tests/test-state-restore-fragmented.cpp b/tests/test-state-restore-fragmented.cpp index b7e0d8d2612..481b39d04c7 100644 --- a/tests/test-state-restore-fragmented.cpp +++ b/tests/test-state-restore-fragmented.cpp @@ -1,5 +1,5 @@ // Test for state restore with fragmented KV cache -// This tests the fix for: https://github.com/ggml-org/llama.cpp/pull/XXXX +// This tests the fix for: https://github.com/ggml-org/llama.cpp/issues/17527 // The issue was that state restore required contiguous KV cache slots, // which fails when the cache is fragmented. 
// @@ -17,29 +17,22 @@ int main(int argc, char ** argv) { common_params params; - params.prompt = "The quick brown fox"; params.sampling.seed = 1234; + params.kv_unified = true; + params.n_parallel = 3; + params.n_ctx = 256; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - // Need multiple sequences to create fragmentation - if (params.n_parallel < 3) { - params.n_parallel = 3; - } - common_init(); - if (params.n_predict < 0) { - params.n_predict = 8; - } - // init - common_init_result llama_init = common_init_from_params(params); + common_init_result_ptr llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + llama_model * model = llama_init->model(); + llama_context * ctx = llama_init->context(); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); @@ -49,12 +42,15 @@ int main(int argc, char ** argv) { GGML_UNUSED(model); // tokenize prompt - auto tokens = common_tokenize(ctx, params.prompt, true); + std::vector tokens(70, 1); - // Step 1: Process tokens on seq 0 - llama_batch batch = llama_batch_init(tokens.size(), 0, 1); + // interleave the 3 sequences: + // 01201230123... + llama_batch batch = llama_batch_init(params.n_parallel*tokens.size(), 0, 1); for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {0}, false); + for (int s = 0; s < params.n_parallel; ++s) { + common_batch_add(batch, tokens[i], i, {s}, false); + } } batch.logits[batch.n_tokens - 1] = true; @@ -63,58 +59,28 @@ int main(int argc, char ** argv) { return 1; } - // Step 2: Process tokens on seq 1 (to create fragmentation later) - common_batch_clear(batch); - for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {1}, false); - } - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to decode seq 1\n", __func__); - return 1; - } - - // Step 3: Process tokens on seq 2 (to create more fragmentation) - common_batch_clear(batch); - for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {2}, false); - } - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to decode seq 2\n", __func__); - return 1; - } - fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size()); - // Step 4: Save state of seq 0 - std::vector seq_state(llama_state_seq_get_size(ctx, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 0); + // Save state of seq 1 + std::vector seq_state(llama_state_seq_get_size(ctx, 1)); + const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 1); if (ncopy != seq_state.size()) { - fprintf(stderr, "%s : failed to save seq 0 state\n", __func__); + fprintf(stderr, "%s : failed to save seq 1 state\n", __func__); return 1; } - fprintf(stderr, "%s : saved seq 0 state, %zu bytes\n", __func__, ncopy); + fprintf(stderr, "%s : saved seq 1 state, %zu bytes\n", __func__, ncopy); - // Step 5: Clear seq 1 to create a "hole" in the KV cache (fragmentation) + // clear seq 1 to create a "hole" in the KV cache (fragmentation) + // 0.20.20.20.2.... 
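+    // Illustrative note (assumes the batch above is placed in submission order):
+    // removing seq 1 frees every third one of the 210 occupied cells, leaving
+    // single-cell holes plus ~46 unused cells at the end of the 256-cell context,
+    // so there is no contiguous run of the ~70 cells needed to restore the state.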
llama_memory_t mem = llama_get_memory(ctx); llama_memory_seq_rm(mem, 1, -1, -1); fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__); - // Step 6: Clear seq 0 as well - llama_memory_seq_rm(mem, 0, -1, -1); - fprintf(stderr, "%s : cleared seq 0\n", __func__); - - // Now the cache has: - // - A hole where seq 0 was (at the beginning) - // - A hole where seq 1 was (in the middle) - // - seq 2 data (at the end) + // Now the cache has holes where seq 1 was // This creates fragmentation - there's no contiguous block large enough - // for the seq 0 state if we only look for contiguous slots + // for the seq 1 state if we only look for contiguous slots - // Step 7: Restore seq 0 state into seq 1 (should work with non-contiguous allocation) + // Restore seq 1 state into seq 1 (should work with non-contiguous allocation) // We use seq 1 since it's a valid sequence ID (0 to n_parallel-1) // Before the fix, this would fail with "failed to find available cells in kv cache" const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1); @@ -127,7 +93,7 @@ int main(int argc, char ** argv) { } fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset); - // Step 8: Verify we can decode with the restored state + // Verify we can decode with the restored state // Generate one token to verify the restored state is usable auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -153,4 +119,4 @@ int main(int argc, char ** argv) { llama_batch_free(batch); return 0; -} \ No newline at end of file +} From b7d52f0480c966cf21dbac55188618631ae69ebf Mon Sep 17 00:00:00 2001 From: Scott Sweeney <1149151+ssweens@users.noreply.github.com> Date: Mon, 15 Dec 2025 07:14:02 -0800 Subject: [PATCH 3/4] cleanup: tightened state_read_meta sig, added is_contiguous case --- src/llama-kv-cache.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 977159530fd..1868f118572 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -78,6 +78,9 @@ class llama_kv_cache : public llama_memory_i { if (idxs.empty() || idxs[0].empty()) { return true; } + if (idxs.size() > 1) { + return false; + } const uint32_t h = idxs[0][0]; for (size_t i = 0; i < idxs[0].size(); ++i) { if (idxs[0][i] != h + i) { @@ -278,7 +281,7 @@ class llama_kv_cache : public llama_memory_i { void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const; void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const; - bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id, slot_info & sinfo); + bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1); bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo); }; From d5bc2dfa20772b7f2555db10b11f7c2a58ea5c47 Mon Sep 17 00:00:00 2001 From: Scott Sweeney <1149151+ssweens@users.noreply.github.com> Date: Mon, 15 Dec 2025 08:41:39 -0800 Subject: [PATCH 4/4] fix: state_read_meta arg reorder loose ends --- src/llama-kv-cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 64fb73a7c5f..3186242d60f 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1564,7 +1564,7 @@ void llama_kv_cache::state_read(llama_io_read_i & io, 
llama_seq_id seq_id, llama slot_info sinfo; bool res = true; - res = res && state_read_meta(io, strm, cell_count, seq_id, sinfo); + res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id); res = res && state_read_data(io, strm, cell_count, sinfo); if (!res) { @@ -1704,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t } } -bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id, slot_info & sinfo) { +bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) { auto & cells = v_cells[strm]; auto & head = v_heads[strm];
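For reference, the contiguous fast path and the per-row scatter path added in patch 1 can be illustrated with a small standalone sketch. This is not llama.cpp code: a plain byte buffer and memcpy stand in for the ggml tensor and ggml_backend_tensor_set, and all sizes and cell indices are made-up example values.

// Illustrative sketch of the two write paths in state_read_data():
// a single block copy when the destination cells are contiguous, and a
// per-row scatter when they are not.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Returns true when the cell indices form one contiguous run.
static bool is_contiguous(const std::vector<uint32_t> & idxs) {
    for (size_t i = 0; i < idxs.size(); ++i) {
        if (idxs[i] != idxs[0] + i) {
            return false;
        }
    }
    return true;
}

int main() {
    const size_t size_row   = 8;  // bytes per KV row (example value)
    const size_t n_cells    = 16; // total cells in the "cache"
    const size_t cell_count = 4;  // cells being restored

    std::vector<uint8_t> cache(n_cells * size_row, 0);
    std::vector<uint8_t> src(cell_count * size_row, 0xAB); // rows read from the session file

    // A fragmented cache can hand back scattered destination indices,
    // e.g. when every other cell is already occupied.
    const std::vector<uint32_t> idxs = { 1, 3, 5, 7 };

    if (is_contiguous(idxs)) {
        // fast path: one copy for the whole range
        std::memcpy(cache.data() + idxs[0] * size_row, src.data(), cell_count * size_row);
    } else {
        // slow path: scatter the saved rows to their individual cells
        for (size_t i = 0; i < cell_count; ++i) {
            std::memcpy(cache.data() + idxs[i] * size_row, src.data() + i * size_row, size_row);
        }
    }

    printf("restored %zu cells via the %s path\n",
           cell_count, is_contiguous(idxs) ? "contiguous" : "scatter");
    return 0;
}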