From c62a495a02f2439ceef524c2e980190af95d23f3 Mon Sep 17 00:00:00 2001 From: Scott Sweeney <1149151+ssweens@users.noreply.github.com> Date: Fri, 12 Dec 2025 00:39:33 -0800 Subject: [PATCH 1/4] kv-cache : fix state restore with fragmented cache (#17527) Change find_slot to allow non-contiguous allocation during state restore. Fixes 'failed to find available cells in kv cache' error when restoring state to fragmented cache. --- src/llama-kv-cache.cpp | 91 ++++++++++---- src/llama-kv-cache.h | 18 ++- tests/CMakeLists.txt | 8 ++ tests/test-state-restore-fragmented.cpp | 156 ++++++++++++++++++++++++ 4 files changed, 244 insertions(+), 29 deletions(-) create mode 100644 tests/test-state-restore-fragmented.cpp diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 3e02bd62977..0d3a762fe01 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1549,9 +1549,11 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama const uint32_t strm = seq_id == -1 ? s : seq_to_stream[seq_id]; + slot_info sinfo; + bool res = true; - res = res && state_read_meta(io, strm, cell_count, seq_id); - res = res && state_read_data(io, strm, cell_count); + res = res && state_read_meta(io, strm, cell_count, seq_id, sinfo); + res = res && state_read_data(io, strm, cell_count, sinfo); if (!res) { if (seq_id == -1) { @@ -1690,7 +1692,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t } } -bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id) { +bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id, slot_info & sinfo) { auto & cells = v_cells[strm]; auto & head = v_heads[strm]; @@ -1727,7 +1729,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 ubatch.seq_id[i] = &dest_seq_id; } - const auto sinfo = find_slot(ubatch, true); + sinfo = find_slot(ubatch, false); if (sinfo.empty()) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; @@ -1737,20 +1739,16 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 // see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350 apply_ubatch(sinfo, ubatch); - const auto head_cur = sinfo.head(); - - // keep the head at the old position because we will read the KV data into it in state_read_data() - head = head_cur; - - LLAMA_LOG_DEBUG("%s: head_cur = %d, head = %d, cell_count = %d, dest_seq_id = %d\n", __func__, head_cur, head, cell_count, dest_seq_id); + LLAMA_LOG_DEBUG("%s: cell_count = %d, dest_seq_id = %d\n", __func__, cell_count, dest_seq_id); - // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(head_cur + cell_count <= cells.size()); - GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]); - GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]); - GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id)); - GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id)); + // DEBUG CHECK: verify that all cells were allocated and have correct seq_id and pos values + GGML_ASSERT(sinfo.n_stream() == 1); + GGML_ASSERT(sinfo.idxs[0].size() == cell_count); + for (uint32_t i = 0; i < cell_count; ++i) { + const uint32_t idx = sinfo.idxs[0][i]; + 
GGML_ASSERT(cells.pos_get(idx) == ubatch.pos[i]); + GGML_ASSERT(cells.seq_has(idx, dest_seq_id)); + } } else { // whole KV cache restore @@ -1783,15 +1781,24 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 } } + // Create contiguous slot_info for whole cache restore + sinfo.s0 = strm; + sinfo.s1 = strm; + sinfo.resize(1); + sinfo.strm[0] = strm; + sinfo.idxs[0].resize(cell_count); + for (uint32_t i = 0; i < cell_count; ++i) { + sinfo.idxs[0][i] = i; + } + head = 0; } return true; } -bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count) { +bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo) { auto & cells = v_cells[strm]; - auto & head = v_heads[strm]; uint32_t v_trans; uint32_t n_layer; @@ -1841,8 +1848,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32 } if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + if (sinfo.is_contiguous()) { + // Fast path: contiguous cells, single memcpy + ggml_backend_tensor_set(k, io.read(cell_count * k_size_row), sinfo.head() * k_size_row, cell_count * k_size_row); + } else { + // Slow path: scatter to non-contiguous positions + const void * src = io.read(cell_count * k_size_row); + for (uint32_t i = 0; i < cell_count; ++i) { + const size_t dst_offset = sinfo.idxs[0][i] * k_size_row; + ggml_backend_tensor_set(k, (const char*)src + i * k_size_row, dst_offset, k_size_row); + } + } } } @@ -1873,8 +1889,17 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32 } if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + if (sinfo.is_contiguous()) { + // Fast path: contiguous cells, single memcpy + ggml_backend_tensor_set(v, io.read(cell_count * v_size_row), sinfo.head() * v_size_row, cell_count * v_size_row); + } else { + // Slow path: scatter to non-contiguous positions + const void * src = io.read(cell_count * v_size_row); + for (uint32_t i = 0; i < cell_count; ++i) { + const size_t dst_offset = sinfo.idxs[0][i] * v_size_row; + ggml_backend_tensor_set(v, (const char*)src + i * v_size_row, dst_offset, v_size_row); + } + } } } } else { @@ -1913,10 +1938,22 @@ bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t strm, uint32 } if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (head + j * cells.size()) * v_size_el; - ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + if (sinfo.is_contiguous()) { + // Fast path: contiguous cells + const uint32_t h = sinfo.head(); + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (h + j * cells.size()) * v_size_el; + ggml_backend_tensor_set(v, io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } else { + // Slow path: scatter to non-contiguous positions + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const void * src = io.read(cell_count * v_size_el); + for (uint32_t i = 0; i < cell_count; ++i) { + const size_t dst_offset = (sinfo.idxs[0][i] + j * cells.size()) * v_size_el; + ggml_backend_tensor_set(v, (const char*)src + i * v_size_el, 
dst_offset, v_size_el); + } + } } } } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index bf7821c07ca..977159530fd 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -72,6 +72,20 @@ class llama_kv_cache : public llama_memory_i { void clear() { idxs.clear(); } + + // check if indices are contiguous starting from head() + bool is_contiguous() const { + if (idxs.empty() || idxs[0].empty()) { + return true; + } + const uint32_t h = idxs[0][0]; + for (size_t i = 0; i < idxs[0].size(); ++i) { + if (idxs[0][i] != h + i) { + return false; + } + } + return true; + } }; using slot_info_vec_t = std::vector; @@ -264,8 +278,8 @@ class llama_kv_cache : public llama_memory_i { void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const; void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const; - bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count); + bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id, slot_info & sinfo); + bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo); }; class llama_kv_cache_context : public llama_memory_context_i { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9ba559c8dfb..ed97b1363f1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -222,6 +222,14 @@ llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") +# Test for state restore with fragmented KV cache +# Requires a model, uses same args pattern as test-thread-safety +if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") + llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -c 256 -np 3) +else() + llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -c 256 -np 3) +endif() + if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading llama_build_and_test(test-barrier.cpp) diff --git a/tests/test-state-restore-fragmented.cpp b/tests/test-state-restore-fragmented.cpp new file mode 100644 index 00000000000..b7e0d8d2612 --- /dev/null +++ b/tests/test-state-restore-fragmented.cpp @@ -0,0 +1,156 @@ +// Test for state restore with fragmented KV cache +// This tests the fix for: https://github.com/ggml-org/llama.cpp/pull/XXXX +// The issue was that state restore required contiguous KV cache slots, +// which fails when the cache is fragmented. +// +// The fix changes find_slot(ubatch, true) to find_slot(ubatch, false) +// in state_read_meta(), allowing non-contiguous slot allocation. 
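+//
+// Illustrative example (values made up, not checked by this test): a contiguous
+// allocation might yield slot indices sinfo.idxs[0] = {4, 5, 6, 7}, while on a
+// fragmented cache find_slot(ubatch, false) can return scattered indices such
+// as {1, 4, 7, 10}; state_read_data() then writes the saved rows cell by cell
+// instead of with a single contiguous copy.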
+
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.prompt = "The quick brown fox";
+    params.sampling.seed = 1234;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    // Need multiple sequences to create fragmentation
+    if (params.n_parallel < 3) {
+        params.n_parallel = 3;
+    }
+
+    common_init();
+
+    if (params.n_predict < 0) {
+        params.n_predict = 8;
+    }
+
+    // init
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    GGML_UNUSED(model);
+
+    // tokenize prompt
+    auto tokens = common_tokenize(ctx, params.prompt, true);
+
+    // Step 1: Process tokens on seq 0
+    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {0}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode seq 0\n", __func__);
+        return 1;
+    }
+
+    // Step 2: Process tokens on seq 1 (to create fragmentation later)
+    common_batch_clear(batch);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {1}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode seq 1\n", __func__);
+        return 1;
+    }
+
+    // Step 3: Process tokens on seq 2 (to create more fragmentation)
+    common_batch_clear(batch);
+    for (size_t i = 0; i < tokens.size(); i++) {
+        common_batch_add(batch, tokens[i], i, {2}, false);
+    }
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch)) {
+        fprintf(stderr, "%s : failed to decode seq 2\n", __func__);
+        return 1;
+    }
+
+    fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size());
+
+    // Step 4: Save state of seq 0
+    std::vector<uint8_t> seq_state(llama_state_seq_get_size(ctx, 0));
+    const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 0);
+    if (ncopy != seq_state.size()) {
+        fprintf(stderr, "%s : failed to save seq 0 state\n", __func__);
+        return 1;
+    }
+    fprintf(stderr, "%s : saved seq 0 state, %zu bytes\n", __func__, ncopy);
+
+    // Step 5: Clear seq 1 to create a "hole" in the KV cache (fragmentation)
+    llama_memory_t mem = llama_get_memory(ctx);
+    llama_memory_seq_rm(mem, 1, -1, -1);
+    fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__);
+
+    // Step 6: Clear seq 0 as well
+    llama_memory_seq_rm(mem, 0, -1, -1);
+    fprintf(stderr, "%s : cleared seq 0\n", __func__);
+
+    // Now the cache has:
+    // - A hole where seq 0 was (at the beginning)
+    // - A hole where seq 1 was (in the middle)
+    // - seq 2 data (at the end)
+    // This creates fragmentation - there's no contiguous block large enough
+    // for the seq 0 state if we only look for contiguous slots
+
+    // Step 7: Restore seq 0 state into seq 1 (should work with non-contiguous allocation)
+    // We use seq 1 since it's a valid sequence ID (0 to n_parallel-1)
+    // Before the fix, this would fail with "failed to find available cells in kv cache"
+    const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1);
+    if (nset != seq_state.size()) {
+        fprintf(stderr, "%s : FAILED to restore 
seq state into fragmented cache (got %zu, expected %zu)\n", + __func__, nset, seq_state.size()); + fprintf(stderr, "%s : This is the bug - state restore fails with fragmented KV cache\n", __func__); + llama_batch_free(batch); + return 1; + } + fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset); + + // Step 8: Verify we can decode with the restored state + // Generate one token to verify the restored state is usable + auto sparams = llama_sampler_chain_default_params(); + llama_sampler * smpl = llama_sampler_chain_init(sparams); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); + + auto next_token = llama_sampler_sample(smpl, ctx, -1); + auto next_token_str = common_token_to_piece(ctx, next_token); + + common_batch_clear(batch); + common_batch_add(batch, next_token, (int)tokens.size(), {1}, true); + + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to decode with restored state\n", __func__); + llama_sampler_free(smpl); + llama_batch_free(batch); + return 1; + } + + fprintf(stderr, "%s : successfully decoded with restored state, generated: '%s'\n", __func__, next_token_str.c_str()); + fprintf(stderr, "%s : SUCCESS - state restore works with fragmented KV cache\n", __func__); + + llama_sampler_free(smpl); + llama_batch_free(batch); + + return 0; +} \ No newline at end of file From f51af5740b0a3b446cc52398126d11807ee5f028 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Dec 2025 12:43:24 +0200 Subject: [PATCH 2/4] tests : update logic --- tests/CMakeLists.txt | 4 +- tests/test-state-restore-fragmented.cpp | 86 ++++++++----------------- 2 files changed, 28 insertions(+), 62 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ed97b1363f1..c3d9f9c324f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -225,9 +225,9 @@ llama_build_and_test(test-autorelease.cpp LABEL "model") # Test for state restore with fragmented KV cache # Requires a model, uses same args pattern as test-thread-safety if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") - llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -c 256 -np 3) + llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf) else() - llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf -ngl 99 -c 256 -np 3) + llama_build_and_test(test-state-restore-fragmented.cpp LABEL "model" ARGS -hf ggml-org/models -hff tinyllamas/stories15M-be.Q4_0.gguf) endif() if (NOT GGML_BACKEND_DL) diff --git a/tests/test-state-restore-fragmented.cpp b/tests/test-state-restore-fragmented.cpp index b7e0d8d2612..481b39d04c7 100644 --- a/tests/test-state-restore-fragmented.cpp +++ b/tests/test-state-restore-fragmented.cpp @@ -1,5 +1,5 @@ // Test for state restore with fragmented KV cache -// This tests the fix for: https://github.com/ggml-org/llama.cpp/pull/XXXX +// This tests the fix for: https://github.com/ggml-org/llama.cpp/issues/17527 // The issue was that state restore required contiguous KV cache slots, // which fails when the cache is fragmented. 
// @@ -17,29 +17,22 @@ int main(int argc, char ** argv) { common_params params; - params.prompt = "The quick brown fox"; params.sampling.seed = 1234; + params.kv_unified = true; + params.n_parallel = 3; + params.n_ctx = 256; if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - // Need multiple sequences to create fragmentation - if (params.n_parallel < 3) { - params.n_parallel = 3; - } - common_init(); - if (params.n_predict < 0) { - params.n_predict = 8; - } - // init - common_init_result llama_init = common_init_from_params(params); + common_init_result_ptr llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + llama_model * model = llama_init->model(); + llama_context * ctx = llama_init->context(); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); @@ -49,12 +42,15 @@ int main(int argc, char ** argv) { GGML_UNUSED(model); // tokenize prompt - auto tokens = common_tokenize(ctx, params.prompt, true); + std::vector tokens(70, 1); - // Step 1: Process tokens on seq 0 - llama_batch batch = llama_batch_init(tokens.size(), 0, 1); + // interleave the 3 sequences: + // 01201230123... + llama_batch batch = llama_batch_init(params.n_parallel*tokens.size(), 0, 1); for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {0}, false); + for (int s = 0; s < params.n_parallel; ++s) { + common_batch_add(batch, tokens[i], i, {s}, false); + } } batch.logits[batch.n_tokens - 1] = true; @@ -63,58 +59,28 @@ int main(int argc, char ** argv) { return 1; } - // Step 2: Process tokens on seq 1 (to create fragmentation later) - common_batch_clear(batch); - for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {1}, false); - } - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to decode seq 1\n", __func__); - return 1; - } - - // Step 3: Process tokens on seq 2 (to create more fragmentation) - common_batch_clear(batch); - for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {2}, false); - } - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to decode seq 2\n", __func__); - return 1; - } - fprintf(stderr, "%s : processed prompt on seq 0, 1, 2 (%zu tokens each)\n", __func__, tokens.size()); - // Step 4: Save state of seq 0 - std::vector seq_state(llama_state_seq_get_size(ctx, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 0); + // Save state of seq 1 + std::vector seq_state(llama_state_seq_get_size(ctx, 1)); + const size_t ncopy = llama_state_seq_get_data(ctx, seq_state.data(), seq_state.size(), 1); if (ncopy != seq_state.size()) { - fprintf(stderr, "%s : failed to save seq 0 state\n", __func__); + fprintf(stderr, "%s : failed to save seq 1 state\n", __func__); return 1; } - fprintf(stderr, "%s : saved seq 0 state, %zu bytes\n", __func__, ncopy); + fprintf(stderr, "%s : saved seq 1 state, %zu bytes\n", __func__, ncopy); - // Step 5: Clear seq 1 to create a "hole" in the KV cache (fragmentation) + // clear seq 1 to create a "hole" in the KV cache (fragmentation) + // 0.20.20.20.2.... 
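+    // Illustrative note (assumes the batch above is placed in submission order):
+    // removing seq 1 frees every third one of the 210 occupied cells, leaving
+    // single-cell holes plus ~46 unused cells at the end of the 256-cell context,
+    // so there is no contiguous run of the ~70 cells needed to restore the state.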
llama_memory_t mem = llama_get_memory(ctx); llama_memory_seq_rm(mem, 1, -1, -1); fprintf(stderr, "%s : cleared seq 1 to create fragmentation\n", __func__); - // Step 6: Clear seq 0 as well - llama_memory_seq_rm(mem, 0, -1, -1); - fprintf(stderr, "%s : cleared seq 0\n", __func__); - - // Now the cache has: - // - A hole where seq 0 was (at the beginning) - // - A hole where seq 1 was (in the middle) - // - seq 2 data (at the end) + // Now the cache has holes where seq 1 was // This creates fragmentation - there's no contiguous block large enough - // for the seq 0 state if we only look for contiguous slots + // for the seq 1 state if we only look for contiguous slots - // Step 7: Restore seq 0 state into seq 1 (should work with non-contiguous allocation) + // Restore seq 1 state into seq 1 (should work with non-contiguous allocation) // We use seq 1 since it's a valid sequence ID (0 to n_parallel-1) // Before the fix, this would fail with "failed to find available cells in kv cache" const size_t nset = llama_state_seq_set_data(ctx, seq_state.data(), seq_state.size(), 1); @@ -127,7 +93,7 @@ int main(int argc, char ** argv) { } fprintf(stderr, "%s : restored state into seq 1, %zu bytes\n", __func__, nset); - // Step 8: Verify we can decode with the restored state + // Verify we can decode with the restored state // Generate one token to verify the restored state is usable auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -153,4 +119,4 @@ int main(int argc, char ** argv) { llama_batch_free(batch); return 0; -} \ No newline at end of file +} From b7d52f0480c966cf21dbac55188618631ae69ebf Mon Sep 17 00:00:00 2001 From: Scott Sweeney <1149151+ssweens@users.noreply.github.com> Date: Mon, 15 Dec 2025 07:14:02 -0800 Subject: [PATCH 3/4] cleanup: tightened state_read_meta sig, added is_contiguous case --- src/llama-kv-cache.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 977159530fd..1868f118572 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -78,6 +78,9 @@ class llama_kv_cache : public llama_memory_i { if (idxs.empty() || idxs[0].empty()) { return true; } + if (idxs.size() > 1) { + return false; + } const uint32_t h = idxs[0][0]; for (size_t i = 0; i < idxs[0].size(); ++i) { if (idxs[0][i] != h + i) { @@ -278,7 +281,7 @@ class llama_kv_cache : public llama_memory_i { void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const; void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const; - bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id, slot_info & sinfo); + bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id = -1); bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo); }; From d5bc2dfa20772b7f2555db10b11f7c2a58ea5c47 Mon Sep 17 00:00:00 2001 From: Scott Sweeney <1149151+ssweens@users.noreply.github.com> Date: Mon, 15 Dec 2025 08:41:39 -0800 Subject: [PATCH 4/4] fix: state_read_meta arg reorder loose ends --- src/llama-kv-cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 64fb73a7c5f..3186242d60f 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1564,7 +1564,7 @@ void llama_kv_cache::state_read(llama_io_read_i & io, 
llama_seq_id seq_id, llama slot_info sinfo; bool res = true; - res = res && state_read_meta(io, strm, cell_count, seq_id, sinfo); + res = res && state_read_meta(io, strm, cell_count, sinfo, seq_id); res = res && state_read_data(io, strm, cell_count, sinfo); if (!res) { @@ -1704,7 +1704,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t } } -bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id, slot_info & sinfo) { +bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, slot_info & sinfo, llama_seq_id dest_seq_id) { auto & cells = v_cells[strm]; auto & head = v_heads[strm];
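For reference, the contiguous fast path and the per-row scatter path added in patch 1 can be illustrated with a small standalone sketch. This is not llama.cpp code: a plain byte buffer and memcpy stand in for the ggml tensor and ggml_backend_tensor_set, and all sizes and cell indices are made-up example values.

// Illustrative sketch of the two write paths in state_read_data():
// a single block copy when the destination cells are contiguous, and a
// per-row scatter when they are not.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Returns true when the cell indices form one contiguous run.
static bool is_contiguous(const std::vector<uint32_t> & idxs) {
    for (size_t i = 0; i < idxs.size(); ++i) {
        if (idxs[i] != idxs[0] + i) {
            return false;
        }
    }
    return true;
}

int main() {
    const size_t size_row   = 8;  // bytes per KV row (example value)
    const size_t n_cells    = 16; // total cells in the "cache"
    const size_t cell_count = 4;  // cells being restored

    std::vector<uint8_t> cache(n_cells * size_row, 0);
    std::vector<uint8_t> src(cell_count * size_row, 0xAB); // rows read from the session file

    // A fragmented cache can hand back scattered destination indices,
    // e.g. when every other cell is already occupied.
    const std::vector<uint32_t> idxs = { 1, 3, 5, 7 };

    if (is_contiguous(idxs)) {
        // fast path: one copy for the whole range
        std::memcpy(cache.data() + idxs[0] * size_row, src.data(), cell_count * size_row);
    } else {
        // slow path: scatter the saved rows to their individual cells
        for (size_t i = 0; i < cell_count; ++i) {
            std::memcpy(cache.data() + idxs[i] * size_row, src.data() + i * size_row, size_row);
        }
    }

    printf("restored %zu cells via the %s path\n",
           cell_count, is_contiguous(idxs) ? "contiguous" : "scatter");
    return 0;
}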