Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2997,6 +2997,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_autoload = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
// for the router server: fully terminate model instances that have been idle
// for N seconds; -1 (the default) disables the feature entirely
add_opt(common_arg(
    {"--stop-idle-seconds"}, "SECONDS",
    string_format("for router server, fully terminate model instances after N seconds of inactivity (default: %d; -1 = disabled)", params.stop_idle_seconds),
    [](common_params & params, int value) {
        // accept -1 (disabled) or any strictly positive number of seconds
        if (value < -1 || value == 0) {
            throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
        }
        params.stop_idle_seconds = value;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STOP_IDLE_SECONDS"));
add_opt(common_arg(
{"--jinja"},
{"--no-jinja"},
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ struct common_params {
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
int stop_idle_seconds = -1; // for router server, fully terminate idle model instances (-1 = disabled)

bool log_json = false;

Expand Down
3 changes: 3 additions & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
| `--stop-idle-seconds SECONDS` | for router server, fully terminate model instances after N seconds of inactivity (default: -1; -1 = disabled)<br/>(env: LLAMA_ARG_STOP_IDLE_SECONDS) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
Expand Down Expand Up @@ -1689,6 +1690,8 @@ The server supports an automatic sleep mode that activates after a specified per

When the server enters sleep mode, the model and its associated memory (including the KV cache) are unloaded from RAM to conserve resources. Any new incoming task will automatically trigger the model to reload.

**Note:** In router mode, `--sleep-idle-seconds` applies to each child server process individually (VRAM/RAM unload within the process). To fully terminate idle model subprocesses instead, use `--stop-idle-seconds`. When a terminated model is requested again, the router will re-spawn its process automatically (requires `--models-autoload`, which is enabled by default).

The sleeping status can be retrieved from the `GET /props` endpoint (or `/props?model=(model_name)` in router mode).

Note that the following endpoints are exempt from being considered as incoming tasks. They do not trigger model reloading and do not reset the idle timer:
Expand Down
44 changes: 44 additions & 0 deletions tools/server/server-models.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
preset.unset_option("LLAMA_ARG_MODELS_MAX");
preset.unset_option("LLAMA_ARG_MODELS_PRESET");
preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD");
preset.unset_option("LLAMA_ARG_STOP_IDLE_SECONDS");
if (unset_model_args) {
preset.unset_option("LLAMA_ARG_MODEL");
preset.unset_option("LLAMA_ARG_MMPROJ");
Expand Down Expand Up @@ -614,6 +615,49 @@ void server_models::unload(const std::string & name) {
}
}

// Start the background watchdog thread that fully terminates model instances
// which have been inactive for longer than `unload_idle_seconds`.
// A negative value disables the watchdog (the default is -1).
// Must be paired with stop_idle_watchdog() before destruction.
void server_models::start_idle_watchdog(int unload_idle_seconds) {
    if (unload_idle_seconds < 0) {
        return; // disabled
    }
    if (watchdog_running.load() || watchdog_thread.joinable()) {
        // already started: assigning over a joinable std::thread would call
        // std::terminate, so a second call is a no-op
        return;
    }
    watchdog_running.store(true);
    watchdog_thread = std::thread([this, unload_idle_seconds]() {
        const int64_t threshold_ms = (int64_t) unload_idle_seconds * 1000;
        SRV_INF("idle watchdog started, unload after %d seconds of inactivity\n", unload_idle_seconds);
        while (watchdog_running.load()) {
            // collect names of idle models under lock
            std::vector<std::string> to_unload;
            {
                std::lock_guard<std::mutex> lk(mutex);
                const int64_t now = ggml_time_ms();
                for (const auto & [name, inst] : mapping) {
                    // last_used > 0 skips instances that were never used
                    if (inst.meta.is_active() && inst.meta.last_used > 0 && (now - inst.meta.last_used) > threshold_ms) {
                        to_unload.push_back(name);
                    }
                }
            }
            // unload outside lock (unload() acquires its own lock)
            // NOTE(review): a model may become active again between the scan above
            // and the unload below - confirm unload() tolerates in-flight requests
            for (const auto & name : to_unload) {
                SRV_INF("idle watchdog: unloading idle model name=%s\n", name.c_str());
                unload(name);
            }
            // sleep in 100ms increments so stop_idle_watchdog() stays responsive
            for (int i = 0; i < 10 && watchdog_running.load(); i++) {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
            }
        }
        SRV_INF("%s", "idle watchdog stopped\n");
    });
}

// Stop the idle watchdog thread and wait for it to exit.
// Safe to call multiple times, and when the watchdog was never started.
void server_models::stop_idle_watchdog() {
    // always clear the flag and join if joinable: gating the join on the flag
    // (as before) could leave a joinable thread unjoined if the two ever
    // desynchronize, which would call std::terminate at destruction
    watchdog_running.store(false);
    if (watchdog_thread.joinable()) {
        watchdog_thread.join();
    }
}

void server_models::unload_all() {
std::vector<std::thread> to_join;
{
Expand Down
9 changes: 9 additions & 0 deletions tools/server/server-models.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "server-common.h"
#include "server-http.h"

#include <atomic>
#include <mutex>
#include <condition_variable>
#include <functional>
Expand Down Expand Up @@ -96,6 +97,10 @@ struct server_models {
std::vector<std::string> base_env;
common_preset base_preset; // base preset from llama-server CLI args

// idle watchdog
std::thread watchdog_thread;
std::atomic<bool> watchdog_running{false};

void update_meta(const std::string & name, const server_model_meta & meta);

// unload least recently used models if the limit is reached
Expand Down Expand Up @@ -135,6 +140,10 @@ struct server_models {
// return false if model is already loaded; return true otherwise (meta may need to be refreshed)
bool ensure_model_loaded(const std::string & name);

// start/stop the idle watchdog thread that terminates idle model instances
void start_idle_watchdog(int unload_idle_seconds);
void stop_idle_watchdog();

// proxy an HTTP request to the model instance
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);

Expand Down
3 changes: 3 additions & 0 deletions tools/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ int main(int argc, char ** argv) {
return 1;
}

models_routes->models.start_idle_watchdog(params.stop_idle_seconds);

// proxy handlers
// note: routes.get_health stays the same
routes.get_metrics = models_routes->proxy_get;
Expand Down Expand Up @@ -210,6 +212,7 @@ int main(int argc, char ** argv) {
clean_up = [&models_routes]() {
SRV_INF("%s: cleaning up before exit...\n", __func__);
if (models_routes.has_value()) {
models_routes->models.stop_idle_watchdog();
models_routes->models.unload_all();
}
llama_backend_free();
Expand Down
89 changes: 89 additions & 0 deletions tools/server/tests/unit/test_stop_idle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pytest
import time
from utils import *

server: ServerProcess


@pytest.fixture(autouse=True)
def create_server():
    # rebind the module-global `server` to a fresh router preset before each
    # test; the test itself configures options and calls server.start()
    global server
    server = ServerPreset.router()


def _get_model_status(model_id: str) -> str:
    """Return the status value reported by GET /models for the given model id."""
    res = server.make_request("GET", "/models")
    assert res.status_code == 200
    for entry in res.body.get("data", []):
        if model_id in (entry.get("id"), entry.get("model")):
            return entry["status"]["value"]
    raise AssertionError(f"Model {model_id} not found in /models response")


def _wait_for_model_status(model_id: str, desired: set[str], timeout: int = 60) -> str:
    """Poll GET /models until `model_id` reaches one of the `desired` statuses.

    Returns the matching status, or raises AssertionError after `timeout` seconds.
    """
    # use a monotonic clock for the deadline: time.time() is wall-clock time and
    # can jump (NTP adjustments), which would distort the timeout
    deadline = time.monotonic() + timeout
    last_status = None
    while time.monotonic() < deadline:
        last_status = _get_model_status(model_id)
        if last_status in desired:
            return last_status
        time.sleep(1)
    raise AssertionError(
        f"Timed out waiting for {model_id} to reach {desired}, last status: {last_status}"
    )


def _load_model_and_wait(model_id: str, timeout: int = 60) -> None:
    """Request loading of `model_id` via POST /models/load and block until loaded."""
    res = server.make_request("POST", "/models/load", data={"model": model_id})
    assert res.status_code == 200
    body = res.body
    assert isinstance(body, dict)
    assert body.get("success") is True
    _wait_for_model_status(model_id, {"loaded"}, timeout=timeout)


def test_router_stop_idle():
    """Idle model instances must be fully terminated after stop_idle_seconds."""
    global server
    server.stop_idle_seconds = 2
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    # bring the model up and confirm it is running
    _load_model_and_wait(model_id, timeout=120)
    assert _get_model_status(model_id) == "loaded"

    # the watchdog should terminate it once the idle window elapses
    _wait_for_model_status(model_id, {"unloaded"}, timeout=10)

    # verify the terminal state directly
    assert _get_model_status(model_id) == "unloaded"


def test_router_stop_idle_respawn():
    """A terminated idle model must be re-spawned on the next request (autoload)."""
    global server
    server.stop_idle_seconds = 2
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    # load the model, then let the watchdog stop it
    _load_model_and_wait(model_id, timeout=120)
    _wait_for_model_status(model_id, {"unloaded"}, timeout=10)

    # a chat request should transparently trigger autoload and succeed
    payload = {
        "model": model_id,
        "max_tokens": 4,
        "messages": [
            {"role": "user", "content": "hello"},
        ],
    }
    res = server.make_request("POST", "/chat/completions", data=payload)
    assert res.status_code == 200
    assert "error" not in res.body

    # the router should report the model as loaded again
    assert _get_model_status(model_id) == "loaded"
3 changes: 3 additions & 0 deletions tools/server/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ class ServerProcess:
mmproj_url: str | None = None
media_path: str | None = None
sleep_idle_seconds: int | None = None
stop_idle_seconds: int | None = None

# session variables
process: subprocess.Popen | None = None
Expand Down Expand Up @@ -233,6 +234,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
server_args.extend(["--media-path", self.media_path])
if self.sleep_idle_seconds is not None:
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
if self.stop_idle_seconds is not None:
server_args.extend(["--stop-idle-seconds", self.stop_idle_seconds])

args = [str(arg) for arg in [server_path, *server_args]]
print(f"tests: starting server with: {' '.join(args)}")
Expand Down