242 changes: 170 additions & 72 deletions config.example.yaml
@@ -1,93 +1,191 @@
# ======
# For a more detailed configuration example:
# https://github.com/mostlygeek/llama-swap/wiki/Configuration
# ======
# llama-swap YAML configuration example
# -------------------------------------
#
# - Below are all the available configuration options for llama-swap.
# - Settings with a default value, or noted as optional, can be omitted.
# - Settings that are marked required must be present in your configuration file.

# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 90
# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
# - optional, default: 120
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# valid log levels: debug, info (default), warn, error
logLevel: debug
# logLevel: sets the logging level
# - optional, default: info
# - Valid log levels: debug, info, warn, error
logLevel: info

# creating a coding profile with models for code generation and general questions
groups:
coding:
swap: false
members:
- "qwen"
- "llama"
# startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
# - it is automatically incremented for every model that uses it
startPort: 10001

# macros: sets a dictionary of string:string pairs
# - optional, default: empty dictionary
# - these are reusable snippets
# - used in a model's cmd, cmdStop, proxy and checkEndpoint
# - useful for reducing common configuration settings
macros:
"latest-llama": >
/path/to/llama-server/llama-server-ec9e0301
--port ${PORT}

# models: a dictionary of model configurations
# - required
# - each key is the model's ID, used in API requests
# - model settings have default values that are used if they are not defined here
# - below are examples of the various settings a model can have:
# - available model settings: env, cmd, cmdStop, proxy, aliases, checkEndpoint, ttl, unlisted
models:

# keys are the model names used in API requests
"llama":
# cmd: the command to run to start the inference server.
# - required
# - it is just a string, similar to what you would run on the CLI
# - using `|` allows for comments in the command; these will be parsed out
# - macros can be used within cmd
cmd: |
models/llama-server-osx
--port ${PORT}
-m models/Llama-3.2-1B-Instruct-Q4_0.gguf
# ${latest-llama} is a macro that is defined above
${latest-llama}
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf

# env: define an array of environment variables to inject into cmd's environment
# - optional, default: empty array
# - each value is a single string
# - in the format: ENV_NAME=value
env:
- "CUDA_VISIBLE_DEVICES=0,1,2"

# list of model name aliases this llama.cpp instance can serve
# proxy: the URL where llama-swap routes API requests
# - optional, default: http://localhost:${PORT}
# - if you used ${PORT} in cmd this can be omitted
# - if you use a custom port in cmd this *must* be set
proxy: http://127.0.0.1:8999

# aliases: alternative model names that this model configuration is used for
# - optional, default: empty array
# - aliases must be unique globally
# - useful for impersonating a specific model
aliases:
- gpt-4o-mini
- "gpt-4o-mini"
- "gpt-3.5-turbo"

# check this path for a HTTP 200 response for the server to be ready
checkEndpoint: /health
# checkEndpoint: URL path to check if the server is ready
# - optional, default: /health
# - use "none" to skip endpoint ready checking
# - endpoint is expected to return an HTTP 200 response
# - all requests wait until the endpoint is ready (or fails)
checkEndpoint: /custom-endpoint

# unload model after 5 seconds
ttl: 5
# ttl: automatically unload the model after this many seconds
# - optional, default: 0
# - ttl must be set to a value greater than 0
# - a value of 0 disables automatic unloading of the model
ttl: 60

"qwen":
cmd: models/llama-server-osx --port ${PORT} -m models/qwen2.5-0.5b-instruct-q8_0.gguf
aliases:
- gpt-3.5-turbo
# useModelName: overrides the model name that is sent to upstream server
# - optional, default: ""
# - useful when the upstream server expects a specific model name or format
useModelName: "qwen:qwq"

# Embedding example with Nomic
# https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
"nomic":
cmd: |
models/llama-server-osx --port ${PORT}
-m models/nomic-embed-text-v1.5.Q8_0.gguf
--ctx-size 8192
--batch-size 8192
--rope-scaling yarn
--rope-freq-scale 0.75
-ngl 99
--embeddings

# Reranking example with bge-reranker
# https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
"bge-reranker":
cmd: |
models/llama-server-osx --port ${PORT}
-m models/bge-reranker-v2-m3-Q4_K_M.gguf
--ctx-size 8192
--reranking
# filters: a dictionary of filter settings
# - optional, default: empty dictionary
filters:
# strip_params: a comma-separated list of parameters to remove from the request
# - optional, default: ""
# - useful for preventing requests from overriding default server params
# - `model` parameter is never removed
# - can be any JSON key in the request body
# - recommended to stick to sampling parameters
strip_params: "temperature, top_p, top_k"

# Unlisted model example:
"qwen-unlisted":
# unlisted: true or false
# - optional, default: false
# - unlisted models do not show up in /v1/models or /upstream lists
# - can be requested as normal through all APIs
unlisted: true
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

# Docker Support (v26.1.4+ required!)
"dockertest":
# Docker example:
# container runtimes like Docker and Podman can also be used with a
# combination of cmd and cmdStop.
"docker-llama":
proxy: "http://127.0.0.1:${PORT}"
cmd: |
docker run --name dockertest
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
ghcr.io/ggerganov/llama.cpp:server
ghcr.io/ggml-org/llama.cpp:server
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

"simple":
# example of setting environment variables
env:
- CUDA_VISIBLE_DEVICES=0,1
- env1=hello
cmd: build/simple-responder --port ${PORT}
unlisted: true
# cmdStop: command to run to stop the model gracefully
# - optional, default: ""
# - useful for stopping commands managed by another system
# - on POSIX systems: a SIGTERM is sent for graceful shutdown
# - on Windows, taskkill is used
# - processes are given 5 seconds to shut down before they are forcefully killed
# - the upstream's process id is available in the ${PID} macro
cmdStop: docker stop dockertest

# use "none" to skip check. Caution this may cause some requests to fail
# until the upstream server is ready for traffic
checkEndpoint: none
# groups: a dictionary of group settings
# - optional, default: empty dictionary
# - provides advanced control over model swapping behaviour.
# - Using groups, some models can be kept loaded indefinitely, while others are swapped out.
# - model ids must be defined in the Models section
# - a model can only be a member of one group
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
# - see issue #109 for details
#
# NOTE: the example below uses model names that are not defined above for demonstration purposes
groups:
# group1 is the same as the default behaviour of llama-swap, where only one model
# is allowed to run at a time across the whole llama-swap instance
"group1":
# swap: controls the model swapping behaviour within the group
# - optional, default: true
# - true : only one model is allowed to run at a time
# - false: all models can run together, no swapping
swap: true

# don't use these, just for testing if things are broken
"broken":
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
proxy: http://127.0.0.1:8999
unlisted: true
"broken_timeout":
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:9000
unlisted: true
# exclusive: controls how the group affects other groups
# - optional, default: true
# - true: causes all other groups to unload when this group runs a model
# - false: does not affect other groups
exclusive: true

# members: references the models defined above
# required
members:
- "llama"
- "qwen-unlisted"

# Example:
# - in this group, all the models can run at the same time
# - when a different group loads, all running models in this group are unloaded
"group2":
swap: false
exclusive: false
members:
- "docker-llama"
- "modelA"
- "modelB"

# Example:
# - a persistent group, which prevents other groups from unloading it
"forever":
# persistent: prevents other groups from unloading the models in this group
# - optional, default: false
# - does not affect individual model behaviour
persistent: true

# set swap/exclusive to false to prevent swapping inside the group
# and the unloading of other groups
swap: false
exclusive: false
members:
- "forever-modelA"
- "forever-modelB"
- "forever-modelc"
3 changes: 3 additions & 0 deletions misc/simple-responder/simple-responder.go
@@ -42,9 +42,12 @@ func main() {
time.Sleep(wait)
}

bodyBytes, _ := io.ReadAll(c.Request.Body)

Comment on lines +45 to +46
⚠️ Potential issue

Add error handling for request body reading.

The error from io.ReadAll is ignored, which could lead to unreliable test behavior if reading the request body fails.

-	bodyBytes, _ := io.ReadAll(c.Request.Body)
+	bodyBytes, err := io.ReadAll(c.Request.Body)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read request body"})
+		return
+	}

c.JSON(http.StatusOK, gin.H{
"responseMessage": *responseMessage,
"h_content_length": c.Request.Header.Get("Content-Length"),
"request_body": string(bodyBytes),
})
})

45 changes: 45 additions & 0 deletions proxy/config.go
@@ -6,6 +6,7 @@ import (
"os"
"regexp"
"runtime"
"slices"
"sort"
"strconv"
"strings"
@@ -29,6 +30,9 @@ type ModelConfig struct {

// Limit concurrency of HTTP requests to process
ConcurrencyLimit int `yaml:"concurrencyLimit"`

// Model filters see issue #174
Filters ModelFilters `yaml:"filters"`
}

func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -63,6 +67,46 @@ func (m *ModelConfig) SanitizedCommand() ([]string, error) {
return SanitizeCommand(m.Cmd)
}

// ModelFilters see issue #174
type ModelFilters struct {
StripParams string `yaml:"strip_params"`
}

func (m *ModelFilters) UnmarshalYAML(unmarshal func(interface{}) error) error {
type rawModelFilters ModelFilters
defaults := rawModelFilters{
StripParams: "",
}

if err := unmarshal(&defaults); err != nil {
return err
}

*m = ModelFilters(defaults)
return nil
}

func (f ModelFilters) SanitizedStripParams() ([]string, error) {
if f.StripParams == "" {
return nil, nil
}

params := strings.Split(f.StripParams, ",")
cleaned := make([]string, 0, len(params))

for _, param := range params {
trimmed := strings.TrimSpace(param)
if trimmed == "model" || trimmed == "" {
continue
}
cleaned = append(cleaned, trimmed)
}

// sort for a stable, deterministic order
slices.Sort(cleaned)
return cleaned, nil
}

type GroupConfig struct {
Swap bool `yaml:"swap"`
Exclusive bool `yaml:"exclusive"`
@@ -212,6 +256,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroValue)
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroValue)
modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroValue)
modelConfig.Filters.StripParams = strings.ReplaceAll(modelConfig.Filters.StripParams, macroSlug, macroValue)
}

// enforce ${PORT} used in both cmd and proxy
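For context on how the sanitized list is meant to be consumed: the request-side filtering code is not part of this file's diff, so the following is only a sketch of the intended usage. It assumes placement in the proxy package, and stripRequestParams is a hypothetical helper rather than a function from this PR.

package proxy

import "encoding/json"

// stripRequestParams is a hypothetical illustration of applying
// ModelFilters.SanitizedStripParams to a JSON request body before it is
// forwarded upstream. The actual proxy wiring in llama-swap may differ.
func stripRequestParams(body []byte, filters ModelFilters) ([]byte, error) {
	params, err := filters.SanitizedStripParams()
	if err != nil {
		return nil, err
	}
	if len(params) == 0 {
		return body, nil
	}

	var payload map[string]any
	if err := json.Unmarshal(body, &payload); err != nil {
		return nil, err
	}

	// SanitizedStripParams never returns "model", so the model key is preserved.
	for _, key := range params {
		delete(payload, key)
	}
	return json.Marshal(payload)
}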
3 changes: 3 additions & 0 deletions proxy/config_posix_test.go
@@ -83,6 +83,9 @@ models:
assert.Equal(t, "", model1.UseModelName)
assert.Equal(t, 0, model1.ConcurrencyLimit)
}

// default empty filter exists
assert.Equal(t, "", model1.Filters.StripParams)
}

func TestConfig_LoadPosix(t *testing.T) {
25 changes: 25 additions & 0 deletions proxy/config_test.go
@@ -300,3 +300,28 @@ models:
})
}
}

func TestConfig_ModelFilters(t *testing.T) {
content := `
macros:
default_strip: "temperature, top_p"
models:
model1:
cmd: path/to/cmd --port ${PORT}
filters:
strip_params: "model, top_k, ${default_strip}, , ,"
`
config, err := LoadConfigFromReader(strings.NewReader(content))
assert.NoError(t, err)
modelConfig, ok := config.Models["model1"]
if !assert.True(t, ok) {
t.FailNow()
}

// make sure `model` and empty strings are not in the list
assert.Equal(t, "model, top_k, temperature, top_p, , ,", modelConfig.Filters.StripParams)
sanitized, err := modelConfig.Filters.SanitizedStripParams()
if assert.NoError(t, err) {
assert.Equal(t, []string{"temperature", "top_k", "top_p"}, sanitized)
}
}