Skip to content
10 changes: 10 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2887,6 +2887,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.lora_init_without_apply = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
    {"--sleep-idle-seconds"}, "SECONDS",
    string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
    [](common_params & params, int value) {
        // only -1 (the "disabled" sentinel) or a strictly positive timeout is
        // meaningful: reject 0 and anything below -1
        if (value == 0 || value < -1) {
            throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
        }
        params.sleep_idle_seconds = value;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--simple-io"},
"use basic IO for better compatibility in subprocesses and limited consoles",
Expand Down
3 changes: 2 additions & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,8 @@ struct common_params {
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

std::vector<std::string> api_keys;

Expand Down
2 changes: 0 additions & 2 deletions tools/cli/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,6 @@ int main(int argc, char ** argv) {
return 1;
}

ctx_cli.ctx_server.init();

console::spinner::stop();
console::log("\n");

Expand Down
10 changes: 10 additions & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1621,6 +1621,16 @@ Example of an error:
}
```

## Sleeping on Idle

The server supports an automatic sleep mode that activates after a specified period of inactivity (no incoming tasks). This feature, introduced in [PR #18228](https://github.com/ggml-org/llama.cpp/pull/18228), can be enabled using the `--sleep-idle-seconds` command-line argument. It works seamlessly in both single-model and multi-model configurations.

When the server enters sleep mode, the model and its associated memory (including the KV cache) are unloaded from RAM to conserve resources. Any new incoming task will automatically trigger the model to reload.

Note that the following endpoints are not counted as incoming tasks: they neither trigger a model reload nor reset the idle timer:
- `GET /health`
- `GET /props`

## More examples

### Interactive mode
Expand Down
205 changes: 130 additions & 75 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,9 @@ struct server_context_impl {

server_metrics metrics;

json webui_settings = json::object();
// cached responses for HTTP API
json json_server_props = json::object();
// json json_server_models = json::object(); // TODO

// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
Expand All @@ -554,7 +556,15 @@ struct server_context_impl {
common_chat_templates_ptr chat_templates;
oaicompat_parser_options oai_parser_opt;

bool sleeping = false;

// If the server is in the sleeping state, destroy() was already invoked when
// sleep was entered (see handle_sleeping_state), so calling it again here
// would double-free the model/context resources.
// NOTE(review): destroy() is not idempotent — confirm it releases everything
// (mctx, per-slot draft contexts, batch) on the sleep-entry path so nothing
// is left allocated when this destructor skips cleanup.
~server_context_impl() {
    if (!sleeping) {
        destroy();
    }
}
Comment on lines 561 to +567
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Potential resource leak if destroyed while sleeping.

When the server is in sleeping state, the destructor skips calling destroy(). However, destroy() also cleans up resources beyond llama_init, including:

  • mctx (multimodal context)
  • Draft contexts and speculative decoding state in slots (ctx_dft, spec)
  • batch memory

These resources will leak if the server is destroyed while sleeping.

🔎 Proposed fix
 ~server_context_impl() {
-    if (!sleeping) {
-        destroy();
-    }
+    destroy();
 }

And update destroy() to be idempotent:

 void destroy() {
+    if (!llama_init && !mctx) {
+        return; // already destroyed
+    }
     llama_init.reset();
     ctx = nullptr;
     model = nullptr;

     mtmd_free(mctx);
     mctx = nullptr;

     // ... rest of cleanup
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
~server_context_impl() {
if (!sleeping) {
destroy();
}
}
~server_context_impl() {
destroy();
}
🤖 Prompt for AI Agents
In tools/server/server-context.cpp around lines 561 to 565, the destructor
currently skips calling destroy() when sleeping which allows resources (mctx,
per-slot ctx_dft and spec, batch memory, etc.) to leak; always invoke destroy()
from the destructor and make destroy() idempotent: have destroy() check and
early-return if already destroyed, free/release mctx, iterate slots to free
ctx_dft and spec and clear any speculative state, free batch memory, stop/join
any background threads, null out pointers and reset flags after freeing, and
guard all frees with null checks to prevent double-free.


void destroy() {
mtmd_free(mctx);

// Clear any sampling context
Expand All @@ -571,22 +581,43 @@ struct server_context_impl {
llama_batch_free(batch);
}

// Transition the server between the awake and sleeping states.
// Entering sleep releases the model and all per-slot resources; waking up
// reloads the model with the same base parameters. The requested state must
// differ from the current one.
void handle_sleeping_state(bool new_state) {
    GGML_ASSERT(sleeping != new_state);
    if (!new_state) {
        // waking up: the model must be back in memory before serving tasks
        SRV_INF("%s", "server is exiting sleeping state\n");
        if (!load_model(params_base)) {
            SRV_ERR("%s", "fatal: failed to reload model after sleeping\n");
            exit(1);
        }
    } else {
        // going to sleep: free the model, contexts and caches
        SRV_INF("%s", "server is entering sleeping state\n");
        destroy();
    }
    sleeping = new_state;
}

// load the model and initialize llama_context
// this may also be called to resume from sleeping state
bool load_model(const common_params & params) {
bool is_resume = sleeping;

if (!is_resume) {
// wiring up server queues
queue_tasks.on_new_task([this](server_task && task) {
process_single_task(std::move(task));
});
queue_tasks.on_update_slots([this]() {
update_slots();
});
queue_tasks.on_sleeping_state([this](bool sleeping) {
handle_sleeping_state(sleeping);
});
}

SRV_INF("loading model '%s'\n", params.model.path.c_str());

params_base = params;

webui_settings = json::object();
if (!params_base.webui_config_json.empty()) {
try {
webui_settings = json::parse(params_base.webui_config_json);
} catch (const std::exception & e) {
SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
return false;
}
}

llama_init = common_init_from_params(params_base);

model = llama_init->model();
Expand Down Expand Up @@ -654,7 +685,9 @@ struct server_context_impl {

std::string & mmproj_path = params_base.mmproj.path;
if (!mmproj_path.empty()) {
mtmd_helper_log_set(common_log_default_callback, nullptr);
if (!is_resume) {
mtmd_helper_log_set(common_log_default_callback, nullptr);
}

mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params_base.mmproj_use_gpu;
Expand Down Expand Up @@ -699,19 +732,6 @@ struct server_context_impl {
}
}

return true;
}

// initialize slots and server-related data
void init() {
// wiring up server queues
queue_tasks.on_new_task([this](server_task && task) {
process_single_task(std::move(task));
});
queue_tasks.on_update_slots([this]() {
update_slots();
});

// Necessary similarity of prompt for slot selection
slot_prompt_similarity = params_base.slot_prompt_similarity;

Expand All @@ -726,6 +746,7 @@ struct server_context_impl {
n_ctx_slot = n_ctx_train;
}

slots.clear();
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot slot;

Expand All @@ -742,13 +763,13 @@ struct server_context_impl {
slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
if (slot.ctx_dft == nullptr) {
SRV_ERR("%s", "failed to create draft context\n");
return;
return false;
}

slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft);
if (slot.spec == nullptr) {
SRV_ERR("%s", "failed to create speculator\n");
return;
return false;
}
for (auto & pair : params_base.speculative.replacements) {
common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
Expand Down Expand Up @@ -782,6 +803,12 @@ struct server_context_impl {
batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
}

if (is_resume) {
return true;
}

// everything below this line is only for fresh model load

metrics.init();

if (params_base.cache_ram_mib != 0) {
Expand Down Expand Up @@ -832,6 +859,65 @@ struct server_context_impl {
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
common_chat_templates_source(chat_templates.get()),
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());

if (!populate_json_responses()) {
SRV_ERR("%s", "failed to populate JSON responses\n");
return false;
}

return true;
}

// Build the JSON payloads cached for HTTP responses (currently the /props
// payload, including the parsed webui settings embedded in it).
// Returns false if the user-supplied webui config string is not valid JSON.
bool populate_json_responses() {
    // populate webui settings
    json json_webui_settings = json::object();
    {
        if (!params_base.webui_config_json.empty()) {
            try {
                json_webui_settings = json::parse(params_base.webui_config_json);
            } catch (const std::exception & e) {
                SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
                return false;
            }
        }
    }

    // populate server properties
    {
        task_params params;
        params.sampling = params_base.sampling;
        // defaults advertised to clients: the sampling parameters and context
        // size a slot uses when the request does not override them
        json default_generation_settings_for_props = json {
            {"params", params.to_json(true)},
            {"n_ctx", get_slot_n_ctx()},
        };

        json_server_props = {
            { "default_generation_settings", default_generation_settings_for_props },
            { "total_slots", params_base.n_parallel },
            { "model_alias", model_name },
            { "model_path", params_base.model.path },
            { "modalities", json {
                {"vision", oai_parser_opt.allow_image},
                {"audio", oai_parser_opt.allow_audio},
            } },
            { "endpoint_slots", params_base.endpoint_slots },
            { "endpoint_props", params_base.endpoint_props },
            { "endpoint_metrics", params_base.endpoint_metrics },
            { "webui", params_base.webui },
            { "webui_settings", json_webui_settings },
            { "chat_template", common_chat_templates_source(chat_templates.get()) },
            { "bos_token", common_token_to_piece(ctx, llama_vocab_bos(vocab), /* special= */ true)},
            { "eos_token", common_token_to_piece(ctx, llama_vocab_eos(vocab), /* special= */ true)},
            { "build_info", build_info },
        };
        // expose the tool-use template variant only when jinja templating is on
        if (params_base.use_jinja) {
            if (auto tool_use_src = common_chat_templates_source(chat_templates.get(), "tool_use")) {
                json_server_props["chat_template_tool_use"] = tool_use_src;
            }
        }
    }

    return true;
}

server_slot * get_slot_by_id(int id) {
Expand Down Expand Up @@ -2662,16 +2748,13 @@ struct server_context_impl {
server_context::server_context() : impl(new server_context_impl()) {}
server_context::~server_context() = default;

void server_context::init() {
impl->init();
}

// Thin pimpl forwarder: load (or reload) the model; returns true on success.
bool server_context::load_model(const common_params & params) {
    return impl->load_model(params);
}

// Run the task queue's main loop. The idle timeout is configured in seconds
// (-1 = sleeping disabled) and converted to milliseconds for the queue.
// Fix: drop the stale parameterless start_loop() call that was left above the
// parameterized one — starting the loop twice would never reach the second call.
void server_context::start_loop() {
    auto & params = impl->params_base;
    impl->queue_tasks.start_loop(params.sleep_idle_seconds * 1000);
}

void server_context::terminate() {
Expand All @@ -2698,10 +2781,17 @@ server_context_info server_context::get_info() const {


// generator-like API for HTTP response generation
// may have bypass_sleep = true if the task does not use ctx_server
struct server_res_generator : server_http_res {
server_response_reader rd;
server_res_generator(server_context_impl & ctx_server)
: rd(ctx_server.queue_tasks, ctx_server.queue_results, HTTP_POLLING_SECONDS) {}
// Construct a response generator for one HTTP request. Unless the route opts
// out via bypass_sleep (e.g. /health, /props — they must not wake the model),
// this blocks until the server has left the sleeping state.
server_res_generator(server_context_impl & ctx_server, bool bypass_sleep = false)
    : rd(ctx_server.queue_tasks, ctx_server.queue_results, HTTP_POLLING_SECONDS) {
    // fast path in case sleeping is disabled
    bypass_sleep |= ctx_server.params_base.sleep_idle_seconds < 0;
    if (!bypass_sleep) {
        // presumably also triggers the wake-up/reload — confirm in queue impl
        ctx_server.queue_tasks.wait_until_no_sleep();
    }
}
void ok(const json & response_data) {
status = 200;
data = safe_json_to_str(response_data);
Expand Down Expand Up @@ -2933,7 +3023,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
void server_routes::init_routes() {
this->get_health = [this](const server_http_req &) {
// error and loading states are handled by middleware
auto res = std::make_unique<server_res_generator>(ctx_server);
auto res = std::make_unique<server_res_generator>(ctx_server, true);
res->ok({{"status", "ok"}});
return res;
};
Expand Down Expand Up @@ -3115,46 +3205,10 @@ void server_routes::init_routes() {
};

this->get_props = [this](const server_http_req &) {
auto res = std::make_unique<server_res_generator>(ctx_server);
json default_generation_settings_for_props;

{
task_params params;

params.sampling = ctx_server.params_base.sampling;

default_generation_settings_for_props = json {
{"params", params.to_json(true)},
{"n_ctx", ctx_server.get_slot_n_ctx()},
};
}

json data = {
{ "default_generation_settings", default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_alias", ctx_server.model_name },
{ "model_path", ctx_server.params_base.model.path },
{ "modalities", json {
{"vision", ctx_server.oai_parser_opt.allow_image},
{"audio", ctx_server.oai_parser_opt.allow_audio},
} },
{ "endpoint_slots", params.endpoint_slots },
{ "endpoint_props", params.endpoint_props },
{ "endpoint_metrics", params.endpoint_metrics },
{ "webui", params.webui },
{ "webui_settings", ctx_server.webui_settings },
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
{ "build_info", build_info },
};
if (ctx_server.params_base.use_jinja) {
if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
data["chat_template_tool_use"] = tool_use_src;
}
}

res->ok(data);
auto res = std::make_unique<server_res_generator>(ctx_server, true);
auto props = ctx_server.json_server_props;
props["is_sleeping"] = ctx_server.queue_tasks.is_sleeping();
res->ok(props);
return res;
};

Expand Down Expand Up @@ -3365,6 +3419,7 @@ void server_routes::init_routes() {
return res;
};

// TODO: allow this endpoint to be accessed bypassing sleep mode, same method as get_props
this->get_models = [this](const server_http_req &) {
auto res = std::make_unique<server_res_generator>(ctx_server);
json model_meta = nullptr;
Expand Down
3 changes: 0 additions & 3 deletions tools/server/server-context.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ struct server_context {
server_context();
~server_context();

// initialize slots and server-related data
void init();

// load the model and initialize llama_context
// returns true on success
bool load_model(const common_params & params);
Expand Down
Loading
Loading