 #
 # 💡 Tip - Use an LLM with this file!
 # ====================================
-# This example configuration is written to be LLM friendly! Try
+# This example configuration is written to be LLM friendly. Try
 # copying this file into an LLM and asking it to explain or generate
 # sections for you.
 # ====================================
-#
+
+# Usage notes:
 # - Below are all the available configuration options for llama-swap.
-# - Settings with a default value, or noted as optional can be omitted.
-# - Settings that are marked required must be in your configuration file
+# - Settings noted as "required" must be in your configuration file
+# - Settings noted as "optional" can be omitted
 
 # healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
 # - optional, default: 120
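
For reference, `healthCheckTimeout` is a single top-level key. A minimal sketch with an illustrative value (the value this file actually sets falls outside the hunks shown):

```yaml
# give slow-loading models up to five minutes to become ready
# (illustrative value, not from this file)
healthCheckTimeout: 300
```
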
@@ -34,9 +35,9 @@ metricsMaxInMemory: 1000
 # - it is automatically incremented for every model that uses it
 startPort: 10001
 
-# macros: sets a dictionary of string:string pairs
+# macros: a dictionary of string substitutions
 # - optional, default: empty dictionary
-# - these are reusable snippets
+# - macros are reusable snippets
 # - used in a model's cmd, cmdStop, proxy and checkEndpoint
 # - useful for reducing common configuration settings
 macros:
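
The macro definitions themselves fall outside the hunks shown. As a sketch of how substitution works, here is a hypothetical macro and its expansion inside a model's cmd (names and paths are placeholders):

```yaml
macros:
  # placeholder path, not from this file
  "latest-llama": >
    /path/to/llama-server --port ${PORT}

models:
  "qwen":
    # ${latest-llama} is replaced with the macro's value before the
    # command runs, and ${PORT} is filled in starting from startPort
    cmd: ${latest-llama} -m /models/qwen.gguf
```
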
@@ -99,44 +100,46 @@ models:
 
     # checkEndpoint: URL path to check if the server is ready
     # - optional, default: /health
-    # - use "none" to skip endpoint ready checking
     # - endpoint is expected to return an HTTP 200 response
-    # - all requests wait until the endpoint is ready (or fails)
+    # - all requests wait until the endpoint is ready or fails
+    # - use "none" to skip endpoint health checking
     checkEndpoint: /custom-endpoint
 
-    # ttl: automatically unload the model after this many seconds
+    # ttl: automatically unload the model after ttl seconds
     # - optional, default: 0
     # - ttl must be a value greater than 0
     # - a value of 0 disables automatic unloading of the model
     ttl: 60
 
-    # useModelName: overrides the model name that is sent to upstream server
+    # useModelName: override the model name that is sent to the upstream server
     # - optional, default: ""
-    # - useful when the upstream server expects a specific model name or format
+    # - useful when the upstream server expects a specific model name that
+    #   is different from the model's ID
     useModelName: "qwen:qwq"
 
     # filters: a dictionary of filter settings
     # - optional, default: empty dictionary
+    # - only strip_params is currently supported
     filters:
       # strip_params: a comma separated list of parameters to remove from the request
       # - optional, default: ""
-      # - useful for preventing overriding of default server params by requests
-      # - `model` parameter is never removed
+      # - useful for server side enforcement of sampling parameters
+      # - the `model` parameter can never be removed
       # - can be any JSON key in the request body
       # - recommended to stick to sampling parameters
       strip_params: "temperature, top_p, top_k"
 
   # Unlisted model example:
   "qwen-unlisted":
-    # unlisted: true or false
+    # unlisted: boolean, true or false
     # - optional, default: false
-    # - unlisted models do not show up in /v1/models or /upstream lists
+    # - unlisted models do not show up in /v1/models api requests
     # - can be requested as normal through all apis
     unlisted: true
     cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
 
   # Docker example:
-  # container run times like Docker and Podman can also be used with a
+  # container runtimes like Docker and Podman can be used reliably with
   # a combination of cmd and cmdStop.
   "docker-llama":
     proxy: "http://127.0.0.1:${PORT}"
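
The cmd for "docker-llama" sits between these hunks. A hedged sketch of the usual cmd/cmdStop pairing for containers (image, paths and container name are illustrative): the fixed --name gives cmdStop a stable target, and ${PORT} is published to the host so the proxy URL above resolves.

```yaml
models:
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    # sketch: image and paths are illustrative, not from this file
    cmd: >
      docker run --init --rm --name dockertest
      -p ${PORT}:8080 -v /models:/models
      ghcr.io/ggml-org/llama.cpp:server
      -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
      --host 0.0.0.0 --port 8080
    cmdStop: docker stop dockertest
```
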
@@ -149,24 +152,26 @@ models:
     # cmdStop: command to run to stop the model gracefully
     # - optional, default: ""
     # - useful for stopping commands managed by another system
-    # - on POSIX systems: a SIGTERM is sent for graceful shutdown
-    # - on Windows, taskkill is used
-    # - processes are given 5 seconds to shutdown until they are forcefully killed
     # - the upstream's process id is available in the ${PID} macro
+    #
+    # When empty, llama-swap has this default behaviour:
+    # - on POSIX systems: a SIGTERM signal is sent
+    # - on Windows: taskkill is called to stop the process
+    # - processes have 5 seconds to shut down before forceful termination is attempted
     cmdStop: docker stop dockertest
 
 # groups: a dictionary of group settings
 # - optional, default: empty dictionary
-# - provide advanced controls over model swapping behaviour.
-# - Using groups some models can be kept loaded indefinitely, while others are swapped out.
-# - model ids must be defined in the Models section
+# - provides advanced controls over model swapping behaviour
+# - using groups, some models can be kept loaded indefinitely while others are swapped out
+# - model IDs must be defined in the Models section
 # - a model can only be a member of one group
 # - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
 # - see issue #109 for details
 #
 # NOTE: the example below uses model names that are not defined above for demonstration purposes
 groups:
-  # group1 is same as the default behaviour of llama-swap where only one model is allowed
+  # group1 works the same as the default behaviour of llama-swap where only one model is allowed
   # to run at a time across the whole llama-swap instance
171176 " group1 " :
     # swap: controls the model swapping behaviour within the group
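
Since the cmdStop notes above mention the ${PID} macro: a sketch of a cmdStop that signals the upstream process directly instead of stopping a container (hypothetical model entry, POSIX only):

```yaml
models:
  "local-llama":  # hypothetical entry, not from this file
    cmd: llama-server --port ${PORT} -m /models/model.gguf
    # ${PID} expands to the upstream's process id; sending SIGTERM
    # mirrors the default behaviour but makes the mechanism explicit
    cmdStop: kill -TERM ${PID}
```
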
@@ -188,10 +193,13 @@ groups:
       - "qwen-unlisted"
 
   # Example:
-  # - in this group all the models can run at the same time
-  # - when a different group loads all running models in this group are unloaded
+  # - in group2 all models can run at the same time
+  # - when a different group is loaded, all running models in this group are unloaded
193198 " group2 " :
194199 swap : false
200+
201+ # exclusive: false does not unload other groups when a model in group2 is requested
202+ # - the models in group2 will be loaded but will not unload any other groups
195203 exclusive : false
196204 members :
197205 - " docker-llama"
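
group1 and group2 demonstrate `swap` and `exclusive`; the `persistent` field mentioned earlier is the third control. A hypothetical group whose members are never unloaded by other groups:

```yaml
groups:
  "always-on":         # hypothetical group and member names
    persistent: true   # members are not unloaded when other groups load
    swap: false        # all members may run at the same time
    exclusive: false   # loading these members does not unload other groups
    members:
      - "embedding-model"
```
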
@@ -220,7 +228,7 @@ groups:
 # - the only supported hook is on_startup
 hooks:
   # on_startup: a dictionary of actions to perform on startup
-  # - optional, default: empty dictionar
+  # - optional, default: empty dictionary
   # - the only supported action is preload
   on_startup:
     # preload: a list of model ids to load on startup
@@ -229,4 +237,4 @@ hooks:
     # - when preloading multiple models at once, define a group
     #   otherwise models will be loaded and swapped out
     preload:
-      - "llama"
+      - "llama"
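
Tying the hook back to groups, as the preload comments recommend: a sketch that places two preloaded models in a shared non-swapping group so the second preload does not evict the first (group name hypothetical):

```yaml
groups:
  "preloaded":  # hypothetical group, not from this file
    swap: false
    exclusive: false
    members:
      - "llama"
      - "qwen-unlisted"

hooks:
  on_startup:
    preload:
      - "llama"
      - "qwen-unlisted"
```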