
Commit 1ac6499

Add macros to Configuration schema (#149)
* Add macros to Configuration schema
* update docs
1 parent 25f3dc2 commit 1ac6499

3 files changed: +190 -149 lines changed


README.md
Lines changed: 13 additions & 140 deletions

````diff
@@ -45,158 +45,31 @@ llama-swap's configuration is purposefully simple.
 ```yaml
 models:
   "qwen2.5":
-    proxy: "http://127.0.0.1:9999"
     cmd: |
       /app/llama-server
       -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
-      --port 9999
+      --port ${PORT}

   "smollm2":
-    proxy: "http://127.0.0.1:9999"
     cmd: |
       /app/llama-server
       -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
-      --port 9999
+      --port ${PORT}
 ```

-<details>
-<summary>But also very powerful ...</summary>
-
-```yaml
-# Seconds to wait for upstream to load and be ready to serve requests
-# minimum is 15 seconds
-# default is 120 seconds
-healthCheckTimeout: 500
-
-# Valid log levels: debug, info (default), warn, error
-logLevel: info
-
-# Automatic Port Values
-# use ${PORT} in model.cmd and model.proxy to use an automatic port number
-# when you use ${PORT} you can omit a custom model.proxy value, as it will
-# default to http://localhost:${PORT}
-
-# override the default port (5800) for automatic port values
-startPort: 10001
+But also very powerful:

-# define valid model values and the upstream server start
-models:
-  "llama":
-    # multiline for readability
-    cmd: |
-      llama-server --port 8999
-      --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
-
-    # environment variables to pass to the command
-    env:
-      - "CUDA_VISIBLE_DEVICES=0"
-
-    # where to reach the server started by cmd, make sure the ports match
-    # can be omitted if you use an automatic ${PORT} in cmd
-    proxy: http://127.0.0.1:8999
-
-    # aliases names to use this model for
-    aliases:
-      - "gpt-4o-mini"
-      - "gpt-3.5-turbo"
-
-    # check this path for an HTTP 200 OK before serving requests
-    # default: /health to match llama.cpp
-    # use "none" to skip endpoint checking, but may cause HTTP errors
-    # until the model is ready
-    checkEndpoint: /custom-endpoint
-
-    # automatically unload the model after this many seconds
-    # ttl values must be a value greater than 0
-    # default: 0 = never unload model
-    ttl: 60
-
-    # `useModelName` overrides the model name in the request
-    # and sends a specific name to the upstream server
-    useModelName: "qwen:qwq"
-
-  # unlisted models do not show up in /v1/models or /upstream lists
-  # but they can still be requested as normal
-  "qwen-unlisted":
-    unlisted: true
-    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
-
-  # Docker Support (v26.1.4+ required!)
-  "docker-llama":
-    proxy: "http://127.0.0.1:${PORT}"
-    cmd: |
-      docker run --name dockertest
-      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
-      ghcr.io/ggml-org/llama.cpp:server
-      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
-
-    # use a custom command to stop the model when swapping. By default
-    # this is SIGTERM on POSIX systems, and taskkill on Windows systems
-    # the ${PID} variable can be used in cmdStop, it will be automatically replaced
-    # with the PID of the running model
-    cmdStop: docker stop dockertest
-
-# Groups provide advanced controls over model swapping behaviour. Using groups
-# some models can be kept loaded indefinitely, while others are swapped out.
-#
-# Tips:
-#
-# - models must be defined above in the Models section
-# - a model can only be a member of one group
-# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
-# - see issue #109 for details
-#
-# NOTE: the example below uses model names that are not defined above for demonstration purposes
-groups:
-  # group1 is the default behaviour of llama-swap where only one model is allowed
-  # to run a time across the whole llama-swap instance
-  "group1":
-    # swap controls the model swapping behaviour in within the group
-    # - true : only one model is allowed to run at a time
-    # - false: all models can run together, no swapping
-    swap: true
-
-    # exclusive controls how the group affects other groups
-    # - true: causes all other groups to unload their models when this group runs a model
-    # - false: does not affect other groups
-    exclusive: true
-
-    # members references the models defined above
-    members:
-      - "llama"
-      - "qwen-unlisted"
-
-  # models in this group are never unloaded
-  "group2":
-    swap: false
-    exclusive: false
-    members:
-      - "docker-llama"
-      # (not defined above, here for example)
-      - "modelA"
-      - "modelB"
-
-  "forever":
-    # setting persistent to true causes the group to never be affected by the swapping behaviour of
-    # other groups. It is a shortcut to keeping some models always loaded.
-    persistent: true
-
-    # set swap/exclusive to false to prevent swapping inside the group and effect on other groups
-    swap: false
-    exclusive: false
-    members:
-      - "forever-modelA"
-      - "forever-modelB"
-      - "forever-modelc"
-```
-
-### Use Case Examples
+- ⚡ `groups` to run multiple models at once
+- ⚡ `macros` for reusable snippets
+- ⚡ `ttl` to automatically unload models
+- ⚡ `aliases` to use familiar model names (e.g., "gpt-4o-mini")
+- ⚡ `env` variables to pass custom environment to inference servers
+- ⚡ `useModelName` to override model names sent to upstream servers
+- ⚡ `healthCheckTimeout` to control model startup wait times
+- ⚡ `${PORT}` automatic port variables for dynamic port assignment
+- ⚡ Docker/podman compatible

-- [config.example.yaml](config.example.yaml) includes examples for supporting `v1/embeddings` and `v1/rerank` endpoints
-- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds from 20% to 40%. This example includes configurations for Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
-- [Optimizing Code Generation](examples/benchmark-snakegame/README.md) - find the optimal settings for your machine. This example demonstrates defining multiple configurations and testing which one is fastest.
-- [Restart on Config Change](examples/restart-on-config-change/README.md) - automatically restart llama-swap when trying out different configurations.
-</details>
+Check the [wiki](https://github.com/mostlygeek/llama-swap/wiki/Configuration) for the full documentation.

 ## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))

````
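The new README bullet names `macros` without showing one. As a hedged illustration of the schema this commit adds (the macro name "server-base" and the flags below are invented for the sketch): a macro is a plain string defined under the top-level `macros` key and referenced as `${name}` inside a model's `cmd`, `cmdStop`, `proxy`, or `checkEndpoint`.

```yaml
# Hypothetical config sketch for the new macros feature.
# "server-base" is an invented macro name; paths and flags are illustrative.
macros:
  "server-base": >
    /app/llama-server
    --port ${PORT}

models:
  "qwen2.5":
    cmd: |
      ${server-base}
      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
```

Because macro substitution runs before the `${PORT}` pass (see the proxy/config.go changes below), a macro value may itself contain `${PORT}`.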

proxy/config.go
Lines changed: 67 additions & 7 deletions

```diff
@@ -4,6 +4,7 @@ import (
     "fmt"
     "io"
     "os"
+    "regexp"
     "runtime"
     "sort"
     "strconv"
@@ -67,6 +68,9 @@ type Config struct {
     Profiles map[string][]string `yaml:"profiles"`
     Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */

+    // for key/value replacements in model's cmd, cmdStop, proxy, checkEndPoint
+    Macros map[string]string `yaml:"macros"`
+
     // map aliases to actual model IDs
     aliases map[string]string

```
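A minimal sketch of the YAML this new field accepts (the key and value here are invented for illustration): each entry under the top-level `macros` key unmarshals to one key/value pair in `Config.Macros`.

```yaml
# "gpu-flags" -> "-ngl 99" becomes one entry in Config.Macros (map[string]string)
macros:
  "gpu-flags": "-ngl 99"
```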

```diff
@@ -141,6 +145,30 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
         }
     }

+    /* check macro constraint rules:
+
+    - name must fit the regex ^[a-zA-Z0-9_-]+$
+    - names must be less than 64 characters (no reason, just cause)
+    - name can not be any reserved macros: PORT
+    - macro values must be less than 1024 characters
+    */
+    macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
+    for macroName, macroValue := range config.Macros {
+        if len(macroName) >= 64 {
+            return Config{}, fmt.Errorf("macro name '%s' exceeds maximum length of 63 characters", macroName)
+        }
+        if !macroNameRegex.MatchString(macroName) {
+            return Config{}, fmt.Errorf("macro name '%s' contains invalid characters, must match pattern ^[a-zA-Z0-9_-]+$", macroName)
+        }
+        if len(macroValue) >= 1024 {
+            return Config{}, fmt.Errorf("macro value for '%s' exceeds maximum length of 1024 characters", macroName)
+        }
+        switch macroName {
+        case "PORT":
+            return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
+        }
+    }
+
     // Get and sort all model IDs first, makes testing more consistent
     modelIds := make([]string, 0, len(config.Models))
     for modelId := range config.Models {
```
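How these constraints look from the config side, sketched with invented names:

```yaml
macros:
  "ctx_8k": "--ctx-size 8192"    # accepted: matches ^[a-zA-Z0-9_-]+$, under 64 chars
  "small-model": "-hf org/repo"  # accepted: hyphens and underscores are allowed
  # "PORT": "9999"               # rejected: PORT is a reserved macro name
  # "bad name!": "--verbose"     # rejected: the space and '!' fail the name regex
```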
```diff
@@ -151,19 +179,51 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
     nextPort := config.StartPort
     for _, modelId := range modelIds {
         modelConfig := config.Models[modelId]
-        // iterate over the models and replace any ${PORT} with the next available port
-        if strings.Contains(modelConfig.Cmd, "${PORT}") {
-            modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", strconv.Itoa(nextPort))
+
+        // go through model config fields: cmd, cmdStop, proxy, checkEndPoint and replace macros with macro values
+        for macroName, macroValue := range config.Macros {
+            macroSlug := fmt.Sprintf("${%s}", macroName)
+            modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroValue)
+            modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroValue)
+            modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroValue)
+            modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroValue)
+        }
+
+        // only iterate over models that use ${PORT} to keep port numbers from increasing unnecessarily
+        if strings.Contains(modelConfig.Cmd, "${PORT}") || strings.Contains(modelConfig.Proxy, "${PORT}") || strings.Contains(modelConfig.CmdStop, "${PORT}") {
             if modelConfig.Proxy == "" {
-                modelConfig.Proxy = fmt.Sprintf("http://localhost:%d", nextPort)
-            } else {
-                modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", strconv.Itoa(nextPort))
+                modelConfig.Proxy = "http://localhost:${PORT}"
             }
+
+            nextPortStr := strconv.Itoa(nextPort)
+            modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", nextPortStr)
+            modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${PORT}", nextPortStr)
+            modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", nextPortStr)
             nextPort++
-            config.Models[modelId] = modelConfig
         } else if modelConfig.Proxy == "" {
             return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
         }
+
+        // make sure there are no unknown macros that have not been replaced
+        macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
+        fieldMap := map[string]string{
+            "cmd":           modelConfig.Cmd,
+            "cmdStop":       modelConfig.CmdStop,
+            "proxy":         modelConfig.Proxy,
+            "checkEndpoint": modelConfig.CheckEndpoint,
+        }
+
+        for fieldName, fieldValue := range fieldMap {
+            matches := macroPattern.FindAllStringSubmatch(fieldValue, -1)
+            for _, match := range matches {
+                macroName := match[1]
+                if _, exists := config.Macros[macroName]; !exists {
+                    return Config{}, fmt.Errorf("unknown macro '${%s}' found in %s.%s", macroName, modelId, fieldName)
+                }
+            }
+        }
+
+        config.Models[modelId] = modelConfig
     }

     config = AddDefaultGroupToConfig(config)
```
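An end-to-end sketch of the two substitution passes (model and macro names invented; `startPort` is the existing override for automatic ports): macros expand first into `cmd`, `cmdStop`, `proxy`, and `checkEndpoint`; then each model that still references `${PORT}` gets the next port, with models processed in sorted ID order; any remaining `${...}` that is not a defined macro fails config loading.

```yaml
startPort: 10001

macros:
  "srv": "/app/llama-server --port ${PORT}"

models:
  "modelA":
    # expands to: /app/llama-server --port 10001 -m a.gguf
    # proxy defaults to http://localhost:10001
    cmd: ${srv} -m a.gguf
  "modelB":
    # sorted after modelA, so it gets the next port: 10002
    cmd: ${srv} -m b.gguf

  # "broken":
  #   cmd: ${typo} -m c.gguf
  # would fail with: unknown macro '${typo}' found in broken.cmd
```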
