#pragma warning(disable : 4996) // MSVC: silence C4996 deprecation warnings for "unsafe" CRT functions
#include "chat.h"
#include "utils.hpp"
#include "common.h"
#include "speculative.h"
#include "mtmd.h"
#include "sampling.h"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "grammar-parser.h"
#include "llama-vocab.h"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define CPPHTTPLIB_NO_EXCEPTIONS 1
#endif
#include <nlohmann/json.hpp>
#include "index.html.gz.hpp"
#include "index_llamacpp.html.gz.hpp"
#include "loading.html.hpp"
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <set>
#include <mutex>
#include <thread>
#include <signal.h>
#include <memory>
#include <random>
#include <algorithm>
#include <src/llama-impl.h>
#ifdef SQLITE3_MODERN_CPP_SUPPORT
#include <sqlite_modern_cpp.h>
struct DatabaseHandle {
sqlite::database db;
DatabaseHandle(const std::string& path) : db(path) {
db << "CREATE TABLE IF NOT EXISTS sessions (key TEXT PRIMARY KEY, data TEXT)";
db << "CREATE TABLE IF NOT EXISTS templates (key TEXT PRIMARY KEY, data TEXT)";
db << "CREATE TABLE IF NOT EXISTS names (key TEXT PRIMARY KEY, data TEXT)";
}
};
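// A minimal usage sketch for the optional sqlite-backed store above, assuming the
// sqlite_modern_cpp stream-style binding API (the file name and values below are
// illustrative only, not taken from the rest of this file):
//
//   DatabaseHandle handle("server_state.db");
//   handle.db << "INSERT OR REPLACE INTO sessions (key, data) VALUES (?, ?);"
//             << "session-123" << R"({"messages":[]})";
//   handle.db << "SELECT data FROM sessions WHERE key = ?;" << "session-123"
//             >> [](const std::string & data) { /* use the stored JSON */ };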
#endif
using json = nlohmann::ordered_json;
bool server_verbose = false;
bool server_log_json = true;
enum stop_type {
STOP_TYPE_NONE,
STOP_TYPE_EOS,
STOP_TYPE_WORD,
STOP_TYPE_LIMIT,
};
enum slot_state {
SLOT_STATE_IDLE,
SLOT_STATE_PROCESSING,
};
enum slot_command {
SLOT_COMMAND_NONE,
SLOT_COMMAND_LOAD_PROMPT,
SLOT_COMMAND_RELEASE,
};
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum server_task_type {
SERVER_TASK_TYPE_COMPLETION,
SERVER_TASK_TYPE_EMBEDDING,
SERVER_TASK_TYPE_RERANK,
SERVER_TASK_TYPE_INFILL,
SERVER_TASK_TYPE_CANCEL,
SERVER_TASK_TYPE_NEXT_RESPONSE,
SERVER_TASK_TYPE_METRICS,
SERVER_TASK_TYPE_SLOT_SAVE,
SERVER_TASK_TYPE_SLOT_RESTORE,
SERVER_TASK_TYPE_SLOT_ERASE,
SERVER_TASK_TYPE_SET_LORA,
};
enum oaicompat_type {
OAICOMPAT_TYPE_NONE,
OAICOMPAT_TYPE_CHAT,
OAICOMPAT_TYPE_COMPLETION,
OAICOMPAT_TYPE_EMBEDDING,
};
struct result_timings {
int32_t prompt_n = -1;
double prompt_ms;
double prompt_per_token_ms;
double prompt_per_second;
int32_t predicted_n = -1;
double predicted_ms;
double predicted_per_token_ms;
double predicted_per_second;
// Optional speculative metrics - only included when > 0
int32_t draft_n = 0;
int32_t draft_n_accepted = 0;
json to_json() const {
json base = {
{"prompt_n", prompt_n},
{"prompt_ms", prompt_ms},
{"prompt_per_token_ms", prompt_per_token_ms},
{"prompt_per_second", prompt_per_second},
{"predicted_n", predicted_n},
{"predicted_ms", predicted_ms},
{"predicted_per_token_ms", predicted_per_token_ms},
{"predicted_per_second", predicted_per_second},
};
if (draft_n > 0) {
base["draft_n"] = draft_n;
base["draft_n_accepted"] = draft_n_accepted;
}
return base;
}
};
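// For reference, result_timings::to_json() yields an object of the following shape
// (values shown are illustrative only):
//   {
//     "prompt_n": 42, "prompt_ms": 120.0, "prompt_per_token_ms": 2.86, "prompt_per_second": 350.0,
//     "predicted_n": 16, "predicted_ms": 400.0, "predicted_per_token_ms": 25.0, "predicted_per_second": 40.0,
//     "draft_n": 8, "draft_n_accepted": 6   // only present when draft_n > 0
//   }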
struct server_task {
int id = -1; // to be filled by server_queue
int id_multi = -1;
int id_target = -1;
//int id_slot = -1;
// used by SERVER_TASK_TYPE_INFERENCE
server_tokens tokens;
server_task_type type;
json data;
bool infill = false;
bool embedding = false;
server_task() = default;
server_task(server_task_type type) : type(type) {}
};
struct server_task_result {
int id = -1;
int id_multi = -1;
json data;
bool stop;
bool error;
bool final_result = false;
result_timings timings;
// OAI-compat fields
//bool verbose = false;
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_format oaicompat_chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
common_chat_msg oaicompat_msg;
std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
int index = 0;
std::string content;
std::vector<llama_token> tokens;
bool stream;
bool include_usage;
std::string prompt;
//slot_params generation_params;
bool truncated;
int32_t n_decoded;
int32_t n_prompt_tokens;
int32_t n_tokens_cached;
bool has_new_line;
std::string stopping_word;
bool post_sampling_probs = false;
std::vector<completion_token_output> probs_output;
std::vector<std::string> response_fields;
//slot_params generation_params;
bool verbose = false;
int get_index() {
return index;
}
bool is_stop() {
return true; // in stream mode, final responses are considered stop
}
json to_json_final() {
switch (oaicompat) {
case OAICOMPAT_TYPE_NONE:
return to_json_non_oaicompat_final();
case OAICOMPAT_TYPE_COMPLETION:
return to_json_oaicompat_final();
case OAICOMPAT_TYPE_CHAT:
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat_final();
default:
GGML_ASSERT(false && "Invalid oaicompat_type");
}
}
json to_json_partial() {
switch (oaicompat) {
case OAICOMPAT_TYPE_NONE:
return to_json_non_oaicompat_partial();
case OAICOMPAT_TYPE_COMPLETION:
return to_json_oaicompat_partial();
case OAICOMPAT_TYPE_CHAT:
return to_json_oaicompat_chat_partial();
default:
GGML_ASSERT(false && "Invalid oaicompat_type");
}
}
json to_json_non_oaicompat_partial() {
// non-OAI-compat JSON
json res = json{
{"index", index},
{"content", content},
{"tokens", tokens},
{"stop", false},
{"id_slot", id_multi},
{"tokens_predicted", n_decoded},
{"tokens_evaluated", n_prompt_tokens},
};
// populate the timings object when needed (usually for the last response or with timings_per_token enabled)
if (timings.prompt_n > 0) {
res.push_back({ "timings", timings.to_json() });
}
if (!probs_output.empty()) {
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
}
return res;
}
json to_json_non_oaicompat_final() {
json res = json{
{"index", index},
{"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk
{"tokens", stream ? std::vector<llama_token> {} : tokens},
{"id_slot", id_multi},
{"stop", true},
{"model", oaicompat_model},
{"tokens_predicted", n_decoded},
{"tokens_evaluated", n_prompt_tokens},
//{"generation_settings", default_generation_settings_for_props.to_json()},
{"prompt", prompt},
{"has_new_line", has_new_line},
{"truncated", truncated},
//{"stop_type", stop_type_to_str(STOP_TYPE_EOS)},
{"stopping_word", stopping_word},
{"tokens_cached", n_tokens_cached},
{"timings", timings.to_json()},
};
if (!stream && !probs_output.empty()) {
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
}
return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
}
json to_json_oaicompat_partial() {
std::time_t t = std::time(0);
json logprobs = json(nullptr); // OAI default to null
if (probs_output.size() > 0) {
logprobs = json{
{"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
};
}
json res = json{
{"choices", json::array({
json{
{"text", content},
{"index", index},
{"logprobs", logprobs},
{"finish_reason", nullptr},
}
})},
{"created", t},
{"model", oaicompat_model},
{"object", "text_completion"},
{"usage", json {
{"completion_tokens", n_decoded},
{"prompt_tokens", n_prompt_tokens},
{"total_tokens", n_decoded + n_prompt_tokens}
}},
{"id", oaicompat_cmpl_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = to_json_non_oaicompat_partial();
}
if (timings.prompt_n >= 0) {
res.push_back({ "timings", timings.to_json() });
}
return res;
}
json to_json_oaicompat_final() {
std::time_t t = std::time(0);
json logprobs = json(nullptr); // OAI default to null
if (!stream && probs_output.size() > 0) {
logprobs = json{
{"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
};
}
json finish_reason = "length";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
finish_reason = "stop";
}
json res = json{
{"choices", json::array({
json{
{"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk
{"index", index},
{"logprobs", logprobs},
{"finish_reason", finish_reason},
}
})},
{"created", t},
{"model", oaicompat_model},
{"object", "text_completion"},
{"usage", json {
{"completion_tokens", n_decoded},
{"prompt_tokens", n_prompt_tokens},
{"total_tokens", n_decoded + n_prompt_tokens}
}},
{"id", oaicompat_cmpl_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = to_json_non_oaicompat_final();
}
if (timings.prompt_n >= 0) {
res.push_back({ "timings", timings.to_json() });
}
return res;
}
json to_json_oaicompat_chat_partial() {
bool first = n_decoded == 1;
std::time_t t = std::time(0);
json choices;
std::vector<json> deltas;
auto add_delta = [&](const json& delta) {
deltas.push_back({
{"choices", json::array({
json {
{"finish_reason", nullptr},
{"index", 0},
{"delta", delta},
},
})},
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"object", "chat.completion.chunk"},
{"usage", json {
{"completion_tokens", n_decoded},
{"prompt_tokens", n_prompt_tokens},
{"total_tokens", n_decoded + n_prompt_tokens},
}},
});
};
// We have to send an initial update to conform to openai behavior
if (first) {
add_delta({
{"role", "assistant"},
{"content", nullptr},
});
}
for (const auto& diff : oaicompat_msg_diffs) {
add_delta(common_chat_msg_diff_to_json_oaicompat<json>(diff));
}
if (!deltas.empty()) {
GGML_ASSERT(deltas[deltas.size() - 1].at("choices").size() >= 1);
if (probs_output.size() > 0) {
deltas[deltas.size() - 1].at("choices").at(0)["logprobs"] = json{
{"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
};
}
if (timings.prompt_n >= 0) {
deltas[deltas.size() - 1].push_back({ "timings", timings.to_json() });
}
}
return deltas;
}
json to_json_oaicompat_chat_final() {
std::string finish_reason = "length";
common_chat_msg msg;
if (!oaicompat_msg.empty()) {
msg = oaicompat_msg;
}
else {
msg.role = "assistant";
msg.content = content;
}
if (stop) {
finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
}
json choice{
{"finish_reason", finish_reason},
{"index", 0},
{"message", msg.to_json_oaicompat<json>()},
};
if (!stream && probs_output.size() > 0) {
choice["logprobs"] = json{
{"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
};
}
std::time_t t = std::time(0);
json res = json{
{"choices", json::array({choice})},
{"created", t},
{"model", oaicompat_model},
{"object", "chat.completion"},
{"usage", json {
{"completion_tokens", n_decoded},
{"prompt_tokens", n_prompt_tokens},
{"total_tokens", n_decoded + n_prompt_tokens}
}},
{"id", oaicompat_cmpl_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = to_json_non_oaicompat_final();
}
if (timings.prompt_n >= 0) {
res.push_back({ "timings", timings.to_json() });
}
return res;
}
json to_json_oaicompat_chat_stream() {
std::time_t t = std::time(0);
std::string finish_reason = "length";
if (stop) {
//if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
finish_reason = oaicompat_msg.tool_calls.empty() ? "stop" : "tool_calls";
}
json deltas = json::array();
for (const auto& diff : oaicompat_msg_diffs) {
deltas.push_back({
{"choices", json::array({
json {
{"finish_reason", nullptr},
{"index", 0},
{"delta", common_chat_msg_diff_to_json_oaicompat<json>(diff)},
},
})},
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"object", "chat.completion.chunk"},
});
}
deltas.push_back({
{"choices", json::array({
json {
{"finish_reason", finish_reason},
{"index", 0},
{"delta", json::object()},
},
})},
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"object", "chat.completion.chunk"},
});
if (include_usage) {
// OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
// https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
deltas.push_back({
{"choices", json::array()},
{"created", t},
{"id", oaicompat_cmpl_id},
{"model", oaicompat_model},
{"object", "chat.completion.chunk"},
{"usage", json {
{"completion_tokens", n_decoded},
{"prompt_tokens", n_prompt_tokens},
{"total_tokens", n_decoded + n_prompt_tokens},
}},
});
}
if (timings.prompt_n >= 0) {
deltas.back().push_back({ "timings", timings.to_json() });
}
// extra fields for debugging purposes
if (verbose && !deltas.empty()) {
deltas.front()["__verbose"] = to_json_non_oaicompat_final();
}
return deltas;
}
};
static inline std::string stop_type_to_str(stop_type type) {
switch (type) {
case STOP_TYPE_EOS: return "eos";
case STOP_TYPE_WORD: return "word";
case STOP_TYPE_LIMIT: return "limit";
default: return "none";
}
}
struct server_task_multi {
int id = -1;
std::set<int> subtasks_remaining;
std::vector<server_task_result> results;
};
struct slot_params {
bool stream = true;
bool include_usage = false;
bool cache_prompt = true; // remember the prompt to avoid reprocessing it in full
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
std::vector<std::string> antiprompt;
bool timings_per_token = false;
bool post_sampling_probs = false;
json input_prefix;
json input_suffix;
// speculative decoding parameters
struct {
int n_max = 16; // max drafted tokens
int n_min = 0; // min drafted tokens to accept
float p_min = 0.75f; // min probability required to accept a token in the draft
} speculative;
// OAI-compat fields
oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_syntax oaicompat_chat_syntax;
};
struct server_prompt_checkpoint {
llama_pos pos_min;
llama_pos pos_max;
std::vector<uint8_t> data;
size_t size() const {
return data.size();
}
};
struct server_prompt {
server_tokens tokens;
int n_keep;
int n_discarded;
std::vector<uint8_t> data;
std::list<server_prompt_checkpoint> checkpoints;
size_t size() const {
size_t res = data.size();
for (const auto& checkpoint : checkpoints) {
res += checkpoint.size();
}
return res;
}
int n_tokens() const {
return tokens.size();
}
};
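// Cache of saved prompt states (llama_state_seq_* snapshots), evicted oldest-first:
//  - alloc():  drops cached entries that the new prompt fully supersedes and reserves a
//              buffer for the new state; returns nullptr if the prompt is already cached
//              or the allocation fails (in which case the size limit is shrunk)
//  - load():   restores the cached state most similar to the incoming tokens, when it
//              improves on the slot's current prompt
//  - update(): enforces the size limit, always keeping at least one entry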
struct server_prompt_cache {
server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) {
this->limit_size = 1024ull * 1024ull * (limit_size_mib < 0 ? 0 : limit_size_mib);
this->limit_tokens = limit_tokens;
}
std::list<server_prompt> states;
// in bytes, 0 = no limit
size_t limit_size = 0;
// in tokens, 0 = no limit
size_t limit_tokens = 0;
size_t size() const {
size_t res = 0;
for (const auto& state : states) {
res += state.size();
}
return res;
}
size_t n_tokens() const {
size_t res = 0;
for (const auto& state : states) {
res += state.n_tokens();
}
return res;
}
server_prompt* alloc(const server_prompt& prompt, size_t state_size) {
for (auto it = states.begin(); it != states.end();) {
auto tokens_ctx_shift = server_tokens(prompt.tokens.get_text_tokens(), false); // copy cache tokens
tokens_ctx_shift.discard_n_tokens(prompt.n_keep, prompt.n_discarded);
const size_t len = it->tokens.get_common_prefix(tokens_ctx_shift);
// first check if the current state is contained fully in the cache
if (len == tokens_ctx_shift.size()) {
LLAMA_LOG_INFO("%s", " - prompt is already in the cache, skipping\n");
return nullptr;
}
// next, remove any cached prompts that are fully contained in the current prompt
else if(len == it->tokens.size()) {
LLAMA_LOG_INFO(" - removing obsolete cached prompt with length %d\n", (int)len);
it = states.erase(it);
}
else {
++it;
}
}
std::vector<uint8_t> state_data;
// check if we can allocate enough memory for the new state
try {
state_data.resize(state_size);
}
catch (const std::bad_alloc& e) {
LLAMA_LOG_INFO("failed to allocate memory for prompt cache state: %s\n", e.what());
limit_size = std::max<size_t>(1, 0.4 * size());
LLAMA_LOG_INFO(" - cache size limit reduced to %.3f MiB\n", limit_size / (1024.0 * 1024.0));
update();
return nullptr;
}
// TODO: for some reason we can't copy server_tokens, so we have to do this workaround
auto& cur = states.emplace_back();
cur = {
/*.tokens =*/ server_tokens(prompt.tokens.get_text_tokens(), false),
/*.n_keep =*/ prompt.n_keep,
/*.n_discarded =*/ prompt.n_discarded,
/*.data =*/ std::move(state_data),
/*.checkpoints =*/ prompt.checkpoints,
};
return &cur;
}
bool load(server_prompt& prompt, const server_tokens& tokens_new, llama_context* ctx, int32_t id_slot) {
const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);
float f_keep_best = float(lcp_best) / prompt.tokens.size();
float sim_best = prompt.tokens.get_tokens_similarity(tokens_new, prompt.n_keep, prompt.n_discarded);
LLAMA_LOG_INFO(" - looking for better prompt, base f_keep = %.3f, sim = %.3f, n_keep = %d, n_discarded = %d\n", f_keep_best, sim_best, prompt.n_keep, prompt.n_discarded);
auto it_best = states.end();
// find the most similar cached prompt, that would also preserve the most context
for (auto it = states.begin(); it != states.end(); ++it) {
const int lcp_cur = it->tokens.get_common_prefix(tokens_new);
const float f_keep_cur = float(lcp_cur) / it->tokens.size();
const float sim_cur = it->tokens.get_tokens_similarity(tokens_new, it->n_keep, it->n_discarded);
if (sim_best < sim_cur) {
f_keep_best = f_keep_cur;
sim_best = sim_cur;
it_best = it;
}
}
if (it_best != states.end()) {
LLAMA_LOG_INFO(" - found better prompt with f_keep = %.3f, sim = %.3f, n_keep = %d, n_discarded = %d\n", f_keep_best, sim_best, it_best->n_keep, it_best->n_discarded);
const size_t size = it_best->data.size();
const size_t n = llama_state_seq_set_data(ctx, it_best->data.data(), size, id_slot);
if (n != size) {
LLAMA_LOG_INFO("failed to restore state with size %zu\n", size);
return false;
}
it_best->data.clear();
it_best->data.shrink_to_fit();
prompt = std::move(*it_best);
states.erase(it_best);
}
return true;
}
void update() {
if (limit_size > 0) {
// always keep at least one state, regardless of the limits
while (states.size() > 1 && size() > limit_size) {
if (states.empty()) {
break;
}
LLAMA_LOG_INFO(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
states.pop_front();
}
}
// average size per token
const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
// dynamically increase the token limit if it can fit in the memory limit
const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size / size_per_token) : limit_tokens;
//if (limit_tokens > 0) {
//
// while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
// if (states.empty()) {
// break;
// }
// LLAMA_LOG_INFO(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
// limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));
// states.pop_front();
// }
//}
LLAMA_LOG_INFO(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
for (const auto& state : states) {
LLAMA_LOG_INFO(" - prompt %p: %7d tokens, %7d discarded, checkpoints: %2zu, %9.3f MiB\n",
(const void*)&state, state.n_tokens(), state.n_discarded, state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
}
}
};
struct server_slot {
int id;
int id_task = -1;
int id_multi = -1;
struct slot_params params;
slot_state state = SLOT_STATE_IDLE;
slot_command command = SLOT_COMMAND_NONE;
llama_context* ctx = nullptr;
// used to determine which slot has gone the longest without being used
int64_t t_last_used = -1;
std::unique_ptr<const server_task> task;
// generation props
int32_t n_ctx = 0; // context size per slot
int32_t n_past = 0;
int32_t n_decoded = 0;
int32_t n_remaining = -1;
int32_t n_discarded = 0;
int32_t n_kept = 0;
int32_t i_batch = -1;
int32_t n_predict = -1; // TODO: disambiguate from params.n_predict
int32_t n_prompt_tokens = 0;
int32_t n_prompt_tokens_processed = 0;
json prompt; // can be either a string, array of strings or array of token ids
// when a task is submitted, we first tokenize the prompt and store it here
server_tokens prompt_tokens;
server_tokens cache_tokens;
std::string generated_text;
std::vector<completion_token_output> generated_token_probs;
common_chat_msg chat_msg;
bool infill = false;
bool embedding = false;
bool has_next_token = true;
bool truncated = false;
bool stopped_eos = false;
bool stopped_word = false;
bool stopped_limit = false;
bool oaicompat = false;
std::string oaicompat_model;
std::string stopping_word;
stop_type stop;
server_prompt server_cached_prompt;
void prompt_save(server_prompt_cache & prompt_cache) const {
assert(server_cached_prompt.data.size() == 0);
const size_t cur_size = llama_state_seq_get_size(ctx, id);
LLAMA_LOG_INFO(" - saving prompt with length %d, total state size = %.3f MiB\n",
(int)server_cached_prompt.tokens.size(), cur_size / (1024.0 * 1024.0));
auto* cur = prompt_cache.alloc(server_cached_prompt, cur_size);
if (cur == nullptr) {
return;
}
llama_state_seq_get_data(ctx, cur->data.data(), cur_size, id);
}
void prompt_load(server_prompt_cache& prompt_cache, const server_tokens& tokens) {
bool res = prompt_cache.load(server_cached_prompt, tokens, ctx, id);
if (!res) {
LLAMA_LOG_INFO("failed to load prompt from cache\n");
}
}
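// A rough sketch of how these helpers fit together (the real call sites are elsewhere
// in the server; the control flow shown here is illustrative only):
//   slot.prompt_load(prompt_cache, prompt_tokens); // try to reuse a better cached state
//   // ... process the prompt and generate ...
//   slot.prompt_save(prompt_cache);                // snapshot the sequence state
//   prompt_cache.update();                         // evict old entries past the limits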
// sampling
llama_token sampled;
struct llama_sampling_params sparams;
llama_sampling_context * ctx_sampling = nullptr;
json json_schema;
common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
std::vector<std::string> generated_tool_call_ids;
int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor
int32_t ga_w = 512; // group-attention width
// multimodal
mtmd_context * mctx = nullptr;
// speculative decoding
struct llama_speculative * spec = nullptr;
llama_context * ctx_dft = nullptr;
llama_batch batch_spec = {};
// speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
int32_t n_past_se = 0; // self-extend
// stats
size_t n_sent_text = 0; // number of text characters sent so far
size_t n_sent_token_probs = 0;
int64_t t_start_process_prompt;
int64_t t_start_generation;
double t_prompt_processing; // ms
double t_token_generation; // ms
void reset() {
n_prompt_tokens = 0;
generated_text = "";
truncated = false;
stopped_eos = false;
stopped_word = false;
stopped_limit = false;
stopping_word = "";
n_past = 0;
n_sent_text = 0;
n_sent_token_probs = 0;
infill = false;
ga_i = 0;
n_past_se = 0;
chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
generated_token_probs.clear();
// Reset speculative decoding stats
n_draft_total = 0;
n_draft_accepted = 0;
chat_msg = {};
json_schema = json();
generated_tool_call_ids.clear();
task.reset();
}
bool has_budget(gpt_params &global_params) {
if (params.n_predict == -1 && global_params.n_predict == -1) {
return true; // limitless
}
n_remaining = -1;
if (params.n_predict != -1) {
n_remaining = params.n_predict - n_decoded;
} else if (global_params.n_predict != -1) {
n_remaining = global_params.n_predict - n_decoded;
}
return n_remaining > 0; // true while there is still budget left
}
bool available() const {
return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE;
}
bool is_processing() const {
return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING;
}
void add_token_string(const completion_token_output & token) {
if (command == SLOT_COMMAND_RELEASE) {
return;
}
generated_token_probs.push_back(token);
}
void release() {
if (state == SLOT_STATE_PROCESSING) {
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
command = SLOT_COMMAND_RELEASE;
task.reset();
}
}
json get_formated_timings() const {
return json {
{"prompt_n", n_prompt_tokens_processed},
{"prompt_ms", t_prompt_processing},
{"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
{"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed},
{"predicted_n", n_decoded},
{"predicted_ms", t_token_generation},
{"predicted_per_token_ms", t_token_generation / n_decoded},
{"predicted_per_second", 1e3 / t_token_generation * n_decoded},
};
}
result_timings get_timings() const {
result_timings timings;
timings.prompt_n = n_prompt_tokens_processed;
timings.prompt_ms = t_prompt_processing;
timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed;
timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;