153 changes: 13 additions & 140 deletions README.md
@@ -45,158 +45,31 @@ llama-swap's configuration is purposefully simple.
```yaml
models:
"qwen2.5":
proxy: "http://127.0.0.1:9999"
cmd: |
/app/llama-server
-hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
--port 9999
--port ${PORT}

"smollm2":
proxy: "http://127.0.0.1:9999"
cmd: |
/app/llama-server
-hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
--port 9999
--port ${PORT}
```

<details>
<summary>But also very powerful ...</summary>

```yaml
# Seconds to wait for upstream to load and be ready to serve requests
# minimum is 15 seconds
# default is 120 seconds
healthCheckTimeout: 500

# Valid log levels: debug, info (default), warn, error
logLevel: info

# Automatic Port Values
# use ${PORT} in model.cmd and model.proxy to use an automatic port number
# when you use ${PORT} you can omit a custom model.proxy value, as it will
# default to http://localhost:${PORT}

# override the default port (5800) for automatic port values
startPort: 10001
But also very powerful:

# define valid model values and the upstream server start
models:
"llama":
# multiline for readability
cmd: |
llama-server --port 8999
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf

# environment variables to pass to the command
env:
- "CUDA_VISIBLE_DEVICES=0"

# where to reach the server started by cmd, make sure the ports match
# can be omitted if you use an automatic ${PORT} in cmd
proxy: http://127.0.0.1:8999

# aliases names to use this model for
aliases:
- "gpt-4o-mini"
- "gpt-3.5-turbo"

# check this path for an HTTP 200 OK before serving requests
# default: /health to match llama.cpp
# use "none" to skip endpoint checking, but may cause HTTP errors
# until the model is ready
checkEndpoint: /custom-endpoint

# automatically unload the model after this many seconds
# ttl must be a value greater than 0
# default: 0 = never unload model
ttl: 60

# `useModelName` overrides the model name in the request
# and sends a specific name to the upstream server
useModelName: "qwen:qwq"

# unlisted models do not show up in /v1/models or /upstream lists
# but they can still be requested as normal
"qwen-unlisted":
unlisted: true
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

# Docker Support (v26.1.4+ required!)
"docker-llama":
proxy: "http://127.0.0.1:${PORT}"
cmd: |
docker run --name dockertest
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
ghcr.io/ggml-org/llama.cpp:server
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

# use a custom command to stop the model when swapping. By default
# this is SIGTERM on POSIX systems, and taskkill on Windows systems
# the ${PID} variable can be used in cmdStop, it will be automatically replaced
# with the PID of the running model
cmdStop: docker stop dockertest

# Groups provide advanced controls over model swapping behaviour. Using groups,
# some models can be kept loaded indefinitely, while others are swapped out.
#
# Tips:
#
# - models must be defined above in the Models section
# - a model can only be a member of one group
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
# - see issue #109 for details
#
# NOTE: the example below uses model names that are not defined above for demonstration purposes
groups:
# group1 is the default behaviour of llama-swap where only one model is allowed
# to run at a time across the whole llama-swap instance
"group1":
# swap controls the model swapping behaviour within the group
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true

# exclusive controls how the group affects other groups
# - true: causes all other groups to unload their models when this group runs a model
# - false: does not affect other groups
exclusive: true

# members references the models defined above
members:
- "llama"
- "qwen-unlisted"

# models in this group are never unloaded
"group2":
swap: false
exclusive: false
members:
- "docker-llama"
# (not defined above, here for example)
- "modelA"
- "modelB"

"forever":
# setting persistent to true causes the group to never be affected by the swapping behaviour of
# other groups. It is a shortcut to keeping some models always loaded.
persistent: true

# set swap/exclusive to false to prevent swapping inside the group and any effect on other groups
swap: false
exclusive: false
members:
- "forever-modelA"
- "forever-modelB"
- "forever-modelc"
```

### Use Case Examples
- ⚡ `groups` to run multiple models at once
- ⚡ `macros` for reusable snippets (see the sketch after this list)
- ⚡ `ttl` to automatically unload models
- ⚡ `aliases` to use familiar model names (e.g., "gpt-4o-mini")
- ⚡ `env` variables to pass custom environment to inference servers
- ⚡ `useModelName` to override model names sent to upstream servers
- ⚡ `healthCheckTimeout` to control model startup wait times
- ⚡ `${PORT}` automatic port variables for dynamic port assignment
- ⚡ Docker/podman compatible
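
As a quick illustration, here is a minimal, hypothetical sketch of `macros` combined with `${PORT}` (the macro name and paths are illustrative, not taken from this PR):

```yaml
# macros are expanded in cmd, cmdStop, proxy and checkEndpoint before
# ${PORT} is assigned, so a macro value may itself contain ${PORT}
macros:
  "latest-llama": /app/llama-server --port ${PORT}

models:
  "qwen2.5":
    cmd: |
      ${latest-llama}
      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
```

Here `${latest-llama}` expands first, and `${PORT}` then receives an automatic port starting from the default `startPort` of 5800.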

- [config.example.yaml](config.example.yaml) includes examples for supporting `v1/embeddings` and `v1/rerank` endpoints
- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds by 20% to 40%. This example includes configurations for Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
- [Optimizing Code Generation](examples/benchmark-snakegame/README.md) - find the optimal settings for your machine. This example demonstrates defining multiple configurations and testing which one is fastest.
- [Restart on Config Change](examples/restart-on-config-change/README.md) - automatically restart llama-swap when trying out different configurations.
</details>
Check the [wiki](https://github.com/mostlygeek/llama-swap/wiki/Configuration) for full documentation.

## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))

74 changes: 67 additions & 7 deletions proxy/config.go
@@ -4,6 +4,7 @@ import (
"fmt"
"io"
"os"
"regexp"
"runtime"
"sort"
"strconv"
@@ -67,6 +68,9 @@ type Config struct {
Profiles map[string][]string `yaml:"profiles"`
Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */

// for key/value replacements in a model's cmd, cmdStop, proxy, checkEndpoint
Macros map[string]string `yaml:"macros"`

// map aliases to actual model IDs
aliases map[string]string

@@ -141,6 +145,30 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
}
}

/* check macro constraint rules:

- name must fit the regex ^[a-zA-Z0-9_-]+$
- names must be less than 64 characters (no reason, just cause)
- name can not be any reserved macros: PORT
- macro values must be less than 1024 characters
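
  e.g. (illustrative names, not from the PR): "gpu-flags" and "model_dir"
  are valid macro names; "bad name!" fails the regex and "PORT" is
  rejected as reserved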
*/
macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
for macroName, macroValue := range config.Macros {
if len(macroName) >= 64 {
return Config{}, fmt.Errorf("macro name '%s' exceeds maximum length of 63 characters", macroName)
}
if !macroNameRegex.MatchString(macroName) {
return Config{}, fmt.Errorf("macro name '%s' contains invalid characters, must match pattern ^[a-zA-Z0-9_-]+$", macroName)
}
if len(macroValue) >= 1024 {
return Config{}, fmt.Errorf("macro value for '%s' exceeds maximum length of 1024 characters", macroName)
}
switch macroName {
case "PORT":
return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
}
}

// Get and sort all model IDs first, makes testing more consistent
modelIds := make([]string, 0, len(config.Models))
for modelId := range config.Models {
@@ -151,19 +179,51 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
nextPort := config.StartPort
for _, modelId := range modelIds {
modelConfig := config.Models[modelId]
// iterate over the models and replace any ${PORT} with the next available port
if strings.Contains(modelConfig.Cmd, "${PORT}") {
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", strconv.Itoa(nextPort))

// go through model config fields: cmd, cmdStop, proxy, checkEndPoint and replace macros with macro values
for macroName, macroValue := range config.Macros {
macroSlug := fmt.Sprintf("${%s}", macroName)
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroValue)
modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroValue)
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroValue)
modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroValue)
}
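
// note: macros are expanded before the ${PORT} handling below, so a macro
// value may itself contain ${PORT} and still receive an automatic port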

// only iterate over models that use ${PORT} to keep port numbers from increasing unnecessarily
if strings.Contains(modelConfig.Cmd, "${PORT}") || strings.Contains(modelConfig.Proxy, "${PORT}") || strings.Contains(modelConfig.CmdStop, "${PORT}") {
if modelConfig.Proxy == "" {
modelConfig.Proxy = fmt.Sprintf("http://localhost:%d", nextPort)
} else {
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", strconv.Itoa(nextPort))
modelConfig.Proxy = "http://localhost:${PORT}"
}

nextPortStr := strconv.Itoa(nextPort)
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", nextPortStr)
modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${PORT}", nextPortStr)
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", nextPortStr)
nextPort++
config.Models[modelId] = modelConfig
} else if modelConfig.Proxy == "" {
return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
}

// make sure there are no unknown macros that have not been replaced
macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
fieldMap := map[string]string{
"cmd": modelConfig.Cmd,
"cmdStop": modelConfig.CmdStop,
"proxy": modelConfig.Proxy,
"checkEndpoint": modelConfig.CheckEndpoint,
}

for fieldName, fieldValue := range fieldMap {
matches := macroPattern.FindAllStringSubmatch(fieldValue, -1)
for _, match := range matches {
macroName := match[1]
if _, exists := config.Macros[macroName]; !exists {
return Config{}, fmt.Errorf("unknown macro '${%s}' found in %s.%s", macroName, modelId, fieldName)
}
}
}
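
// e.g. a cmd containing "${typo}" with no matching macros entry fails
// here with: unknown macro '${typo}' found in <modelId>.cmd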

config.Models[modelId] = modelConfig
}

config = AddDefaultGroupToConfig(config)