Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2997,6 +2997,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_autoload = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
// for the router server: fully terminate model instances that have been idle
// for N seconds; -1 (the default) disables the feature entirely
add_opt(common_arg(
    {"--stop-idle-seconds"}, "SECONDS",
    string_format("for router server, fully terminate model instances after N seconds of inactivity (default: %d; -1 = disabled)", params.stop_idle_seconds),
    [](common_params & params, int value) {
        // accept -1 (disabled) or any strictly positive number of seconds
        if (value < -1 || value == 0) {
            throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
        }
        params.stop_idle_seconds = value;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STOP_IDLE_SECONDS"));
add_opt(common_arg(
{"--jinja"},
{"--no-jinja"},
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ struct common_params {
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
int stop_idle_seconds = -1; // for router server, fully terminate idle model instances (-1 = disabled)

bool log_json = false;

Expand Down
3 changes: 3 additions & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
| `--stop-idle-seconds SECONDS` | for router server, fully terminate model instances after N seconds of inactivity (default: -1; -1 = disabled)<br/>(env: LLAMA_ARG_STOP_IDLE_SECONDS) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
Expand Down Expand Up @@ -1689,6 +1690,8 @@ The server supports an automatic sleep mode that activates after a specified per

When the server enters sleep mode, the model and its associated memory (including the KV cache) are unloaded from RAM to conserve resources. Any new incoming task will automatically trigger the model to reload.

**Note:** In router mode, `--sleep-idle-seconds` applies to each child server process individually (VRAM/RAM unload within the process). To fully terminate idle model subprocesses instead, use `--stop-idle-seconds`. When a terminated model is requested again, the router will re-spawn its process automatically (requires `--models-autoload`, which is enabled by default).

The sleeping status can be retrieved from the `GET /props` endpoint (or `/props?model=(model_name)` in router mode).

Note that the following endpoints are exempt from being considered as incoming tasks. They do not trigger model reloading and do not reset the idle timer:
Expand Down
44 changes: 44 additions & 0 deletions tools/server/server-models.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
preset.unset_option("LLAMA_ARG_MODELS_MAX");
preset.unset_option("LLAMA_ARG_MODELS_PRESET");
preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD");
preset.unset_option("LLAMA_ARG_STOP_IDLE_SECONDS");
if (unset_model_args) {
preset.unset_option("LLAMA_ARG_MODEL");
preset.unset_option("LLAMA_ARG_MMPROJ");
Expand Down Expand Up @@ -614,6 +615,49 @@ void server_models::unload(const std::string & name) {
}
}

// Start the background watchdog thread that fully terminates model instances
// which have been inactive for longer than `unload_idle_seconds`.
// A negative value disables the watchdog (the default is -1).
// Must be paired with stop_idle_watchdog() before destruction.
void server_models::start_idle_watchdog(int unload_idle_seconds) {
    if (unload_idle_seconds < 0) {
        return; // disabled
    }
    if (watchdog_running.load() || watchdog_thread.joinable()) {
        // already started: assigning over a joinable std::thread would call
        // std::terminate, so a second call is a no-op
        return;
    }
    watchdog_running.store(true);
    watchdog_thread = std::thread([this, unload_idle_seconds]() {
        const int64_t threshold_ms = (int64_t) unload_idle_seconds * 1000;
        SRV_INF("idle watchdog started, unload after %d seconds of inactivity\n", unload_idle_seconds);
        while (watchdog_running.load()) {
            // collect names of idle models under lock
            std::vector<std::string> to_unload;
            {
                std::lock_guard<std::mutex> lk(mutex);
                const int64_t now = ggml_time_ms();
                for (const auto & [name, inst] : mapping) {
                    // last_used > 0 skips instances that were never used
                    if (inst.meta.is_active() && inst.meta.last_used > 0 && (now - inst.meta.last_used) > threshold_ms) {
                        to_unload.push_back(name);
                    }
                }
            }
            // unload outside lock (unload() acquires its own lock)
            // NOTE(review): a model may become active again between the scan above
            // and the unload below - confirm unload() tolerates in-flight requests
            for (const auto & name : to_unload) {
                SRV_INF("idle watchdog: unloading idle model name=%s\n", name.c_str());
                unload(name);
            }
            // sleep in 100ms increments so stop_idle_watchdog() stays responsive
            for (int i = 0; i < 10 && watchdog_running.load(); i++) {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
        }
        SRV_INF("%s", "idle watchdog stopped\n");
    });
}

// Stop the idle watchdog thread and wait for it to exit.
// Safe to call multiple times, and when the watchdog was never started.
void server_models::stop_idle_watchdog() {
    // always clear the flag and join if joinable: gating the join on the flag
    // (as before) could leave a joinable thread unjoined if the two ever
    // desynchronize, which would call std::terminate at destruction
    watchdog_running.store(false);
    if (watchdog_thread.joinable()) {
        watchdog_thread.join();
    }
}

void server_models::unload_all() {
std::vector<std::thread> to_join;
{
Expand Down
9 changes: 9 additions & 0 deletions tools/server/server-models.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "server-common.h"
#include "server-http.h"

#include <atomic>
#include <mutex>
#include <condition_variable>
#include <functional>
Expand Down Expand Up @@ -96,6 +97,10 @@ struct server_models {
std::vector<std::string> base_env;
common_preset base_preset; // base preset from llama-server CLI args

// idle watchdog
std::thread watchdog_thread;
std::atomic<bool> watchdog_running{false};

void update_meta(const std::string & name, const server_model_meta & meta);

// unload least recently used models if the limit is reached
Expand Down Expand Up @@ -135,6 +140,10 @@ struct server_models {
// return false if model is already loaded; return true otherwise (meta may need to be refreshed)
bool ensure_model_loaded(const std::string & name);

// start/stop the idle watchdog thread that terminates idle model instances
void start_idle_watchdog(int unload_idle_seconds);
void stop_idle_watchdog();

// proxy an HTTP request to the model instance
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);

Expand Down
3 changes: 3 additions & 0 deletions tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ int main(int argc, char ** argv) {
return 1;
}

models_routes->models.start_idle_watchdog(params.stop_idle_seconds);

// proxy handlers
// note: routes.get_health stays the same
routes.get_metrics = models_routes->proxy_get;
Expand Down Expand Up @@ -210,6 +212,7 @@ int main(int argc, char ** argv) {
clean_up = [&models_routes]() {
SRV_INF("%s: cleaning up before exit...\n", __func__);
if (models_routes.has_value()) {
models_routes->models.stop_idle_watchdog();
models_routes->models.unload_all();
}
llama_backend_free();
Expand Down
89 changes: 89 additions & 0 deletions tools/server/tests/unit/test_stop_idle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pytest
import time
from utils import *

server: ServerProcess


@pytest.fixture(autouse=True)
def create_server():
    # rebind the module-global `server` to a fresh router preset before each
    # test; the test itself configures options and calls server.start()
    global server
    server = ServerPreset.router()


def _get_model_status(model_id: str) -> str:
    """Return the status value reported by GET /models for the given model id."""
    res = server.make_request("GET", "/models")
    assert res.status_code == 200
    for entry in res.body.get("data", []):
        if model_id in (entry.get("id"), entry.get("model")):
            return entry["status"]["value"]
    raise AssertionError(f"Model {model_id} not found in /models response")


def _wait_for_model_status(model_id: str, desired: set[str], timeout: int = 60) -> str:
    """Poll GET /models until `model_id` reaches one of the `desired` statuses.

    Returns the matching status, or raises AssertionError after `timeout` seconds.
    """
    # use a monotonic clock for the deadline: time.time() is wall-clock time and
    # can jump (NTP adjustments), which would distort the timeout
    deadline = time.monotonic() + timeout
    last_status = None
    while time.monotonic() < deadline:
        last_status = _get_model_status(model_id)
        if last_status in desired:
            return last_status
        time.sleep(1)
    raise AssertionError(
        f"Timed out waiting for {model_id} to reach {desired}, last status: {last_status}"
    )


def _load_model_and_wait(model_id: str, timeout: int = 60) -> None:
    """Request loading of `model_id` via POST /models/load and block until loaded."""
    res = server.make_request("POST", "/models/load", data={"model": model_id})
    assert res.status_code == 200
    body = res.body
    assert isinstance(body, dict)
    assert body.get("success") is True
    _wait_for_model_status(model_id, {"loaded"}, timeout=timeout)


def test_router_stop_idle():
    """Idle model instances must be fully terminated after stop_idle_seconds."""
    global server
    server.stop_idle_seconds = 2
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    # bring the model up and confirm it is running
    _load_model_and_wait(model_id, timeout=120)
    assert _get_model_status(model_id) == "loaded"

    # the watchdog should terminate it once the idle window elapses
    _wait_for_model_status(model_id, {"unloaded"}, timeout=10)

    # verify the terminal state directly
    assert _get_model_status(model_id) == "unloaded"


def test_router_stop_idle_respawn():
    """A terminated idle model must be re-spawned on the next request (autoload)."""
    global server
    server.stop_idle_seconds = 2
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    # load the model, then let the watchdog stop it
    _load_model_and_wait(model_id, timeout=120)
    _wait_for_model_status(model_id, {"unloaded"}, timeout=10)

    # a chat request should transparently trigger autoload and succeed
    payload = {
        "model": model_id,
        "max_tokens": 4,
        "messages": [
            {"role": "user", "content": "hello"},
        ],
    }
    res = server.make_request("POST", "/chat/completions", data=payload)
    assert res.status_code == 200
    assert "error" not in res.body

    # the router should report the model as loaded again
    assert _get_model_status(model_id) == "loaded"
3 changes: 3 additions & 0 deletions tools/server/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ class ServerProcess:
mmproj_url: str | None = None
media_path: str | None = None
sleep_idle_seconds: int | None = None
stop_idle_seconds: int | None = None

# session variables
process: subprocess.Popen | None = None
Expand Down Expand Up @@ -233,6 +234,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
server_args.extend(["--media-path", self.media_path])
if self.sleep_idle_seconds is not None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.stop_idle_seconds is not None:
server_args.extend(["--stop-idle-seconds", self.stop_idle_seconds])

args = [str(arg) for arg in [server_path, *server_args]]
print(f"tests: starting server with: {' '.join(args)}")
Expand Down