diff --git a/config.example.yaml b/config.example.yaml
index c062ddd..e2fe99b 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -1,93 +1,191 @@
-# ======
-# For a more detailed configuration example:
-# https://github.com/mostlygeek/llama-swap/wiki/Configuration
-# ======
+# llama-swap YAML configuration example
+# -------------------------------------
+#
+# - Below are all the available configuration options for llama-swap.
+# - Settings with a default value, or noted as optional, can be omitted.
+# - Settings that are marked required must be in your configuration file.

-# Seconds to wait for llama.cpp to be available to serve requests
-# Default (and minimum): 15 seconds
-healthCheckTimeout: 90
+# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
+#   - optional, default: 120
+#   - minimum value is 15 seconds, anything less will be set to this value
+healthCheckTimeout: 500

-# valid log levels: debug, info (default), warn, error
-logLevel: debug
+# logLevel: sets the logging level
+#   - optional, default: info
+#   - valid log levels: debug, info, warn, error
logLevel: info

-# creating a coding profile with models for code generation and general questions
-groups:
-  coding:
-    swap: false
-    members:
-      - "qwen"
-      - "llama"
+# startPort: sets the starting port number for the automatic ${PORT} macro.
+#   - optional, default: 5800
+#   - the ${PORT} macro can be used in model.cmd and model.proxy settings
+#   - it is automatically incremented for every model that uses it
+startPort: 10001

+# macros: sets a dictionary of string:string pairs
+#   - optional, default: empty dictionary
+#   - these are reusable snippets
+#   - used in a model's cmd, cmdStop, proxy and checkEndpoint
+#   - useful for reducing common configuration settings
+macros:
+  "latest-llama": >
+    /path/to/llama-server/llama-server-ec9e0301
+    --port ${PORT}
+
+# models: a dictionary of model configurations
+#   - required
+#   - each key is the model's ID, used in API requests
+#   - model settings have default values that are used if they are not defined here
+#   - below are examples of the various settings a model can have:
+#   - available model settings: env, cmd, cmdStop, proxy, aliases, checkEndpoint, ttl, unlisted
 models:
+
+  # keys are the model names used in API requests
   "llama":
+    # cmd: the command to run to start the inference server.
+    #   - required
+    #   - it is just a string, similar to what you would run on the CLI
+    #   - using `|` allows for comments in the command; these will be parsed out
+    #   - macros can be used within cmd
     cmd: |
-      models/llama-server-osx
-      --port ${PORT}
-      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
+      # ${latest-llama} is a macro that is defined above
+      ${latest-llama}
+      --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
+
+    # env: defines an array of environment variables to inject into cmd's environment
+    #   - optional, default: empty array
+    #   - each value is a single string
+    #   - in the format: ENV_NAME=value
+    env:
+      - "CUDA_VISIBLE_DEVICES=0,1,2"

-    # list of model name aliases this llama.cpp instance can serve
+    # proxy: the URL where llama-swap routes API requests
+    #   - optional, default: http://localhost:${PORT}
+    #   - if you used ${PORT} in cmd this can be omitted
+    #   - if you use a custom port in cmd this *must* be set
+    proxy: http://127.0.0.1:8999
+
+    # aliases: alternative model names that this model configuration is used for
+    #   - optional, default: empty array
+    #   - aliases must be unique globally
+    #   - useful for impersonating a specific model
     aliases:
-      - gpt-4o-mini
+      - "gpt-4o-mini"
+      - "gpt-3.5-turbo"

-    # check this path for a HTTP 200 response for the server to be ready
-    checkEndpoint: /health
+    # checkEndpoint: URL path to check if the server is ready
+    #   - optional, default: /health
+    #   - use "none" to skip the readiness check
+    #   - endpoint is expected to return an HTTP 200 response
+    #   - all requests wait until the endpoint is ready (or fails)
+    checkEndpoint: /custom-endpoint

-    # unload model after 5 seconds
-    ttl: 5
+    # ttl: automatically unload the model after this many seconds
+    #   - optional, default: 0
+    #   - must be greater than 0 to take effect
+    #   - a value of 0 disables automatic unloading of the model
+    ttl: 60

-  "qwen":
-    cmd: models/llama-server-osx --port ${PORT} -m models/qwen2.5-0.5b-instruct-q8_0.gguf
-    aliases:
-      - gpt-3.5-turbo
+    # useModelName: overrides the model name that is sent to the upstream server
+    #   - optional, default: ""
+    #   - useful when the upstream server expects a specific model name or format
+    useModelName: "qwen:qwq"

-  # Embedding example with Nomic
-  # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
-  "nomic":
-    cmd: |
-      models/llama-server-osx --port ${PORT}
-      -m models/nomic-embed-text-v1.5.Q8_0.gguf
-      --ctx-size 8192
-      --batch-size 8192
-      --rope-scaling yarn
-      --rope-freq-scale 0.75
-      -ngl 99
-      --embeddings
-
-  # Reranking example with bge-reranker
-  # https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
-  "bge-reranker":
-    cmd: |
-      models/llama-server-osx --port ${PORT}
-      -m models/bge-reranker-v2-m3-Q4_K_M.gguf
-      --ctx-size 8192
-      --reranking
+    # filters: a dictionary of filter settings
+    #   - optional, default: empty dictionary
+    filters:
+      # strip_params: a comma-separated list of parameters to remove from the request
+      #   - optional, default: ""
+      #   - useful for preventing requests from overriding default server params
+      #   - the `model` parameter is never removed
+      #   - can be any JSON key in the request body
+      #   - recommended to stick to sampling parameters
+      strip_params: "temperature, top_p, top_k"
+
+  # Unlisted model example:
+  "qwen-unlisted":
+    # unlisted: true or false
+    #   - optional, default: false
+    #   - unlisted models do not show up in /v1/models or /upstream lists
+    #   - can be requested as normal through all APIs
+    unlisted: true
+    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

-  # Docker Support (v26.1.4+ required!)
-  "dockertest":
+  # Docker example:
+  # container runtimes like Docker and Podman can also be used with
+  # a combination of cmd and cmdStop.
+  "docker-llama":
+    proxy: "http://127.0.0.1:${PORT}"
     cmd: |
       docker run --name dockertest
       --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
-      ghcr.io/ggerganov/llama.cpp:server
+      ghcr.io/ggml-org/llama.cpp:server
       --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

-  "simple":
-    # example of setting environment variables
-    env:
-      - CUDA_VISIBLE_DEVICES=0,1
-      - env1=hello
-    cmd: build/simple-responder --port ${PORT}
-    unlisted: true
+    # cmdStop: command to run to stop the model gracefully
+    #   - optional, default: ""
+    #   - useful for stopping commands managed by another system
+    #   - on POSIX systems: a SIGTERM is sent for graceful shutdown
+    #   - on Windows, taskkill is used
+    #   - processes are given 5 seconds to shut down before they are forcefully killed
+    #   - the upstream's process id is available in the ${PID} macro
+    cmdStop: docker stop dockertest

-    # use "none" to skip check. Caution this may cause some requests to fail
-    # until the upstream server is ready for traffic
-    checkEndpoint: none
+# groups: a dictionary of group settings
+#   - optional, default: empty dictionary
+#   - provides advanced control over model swapping behaviour
+#   - using groups, some models can be kept loaded indefinitely while others are swapped out
+#   - model ids must be defined in the Models section
+#   - a model can only be a member of one group
+#   - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
+#   - see issue #109 for details
+#
+# NOTE: the example below uses model names that are not defined above for demonstration purposes
+groups:
+  # group1 is the same as the default behaviour of llama-swap, where only one model is allowed
+  # to run at a time across the whole llama-swap instance
+  "group1":
+    # swap: controls the model swapping behaviour within the group
+    #   - optional, default: true
+    #   - true: only one model is allowed to run at a time
+    #   - false: all models can run together, no swapping
+    swap: true

-  # don't use these, just for testing if things are broken
-  "broken":
-    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
-    proxy: http://127.0.0.1:8999
-    unlisted: true
-  "broken_timeout":
-    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
-    proxy: http://127.0.0.1:9000
-    unlisted: true
\ No newline at end of file
+    # exclusive: controls how the group affects other groups
+    #   - optional, default: true
+    #   - true: causes all other groups to unload when this group runs a model
+    #   - false: does not affect other groups
+    exclusive: true
+
+    # members: references the models defined above
+    #   - required
+    members:
+      - "llama"
+      - "qwen-unlisted"
+
+  # Example:
+  #   - in this group, all the models can run at the same time
+  #   - when a different group loads, all running models in this group are unloaded
+  "group2":
+    swap: false
+    exclusive: false
+    members:
+      - "docker-llama"
+      - "modelA"
+      - "modelB"
+
+  # Example:
+  #   - a persistent group; other groups cannot unload its models
+  "forever":
+    # persistent: prevents other groups from unloading the models in this group
+    #   - optional, default: false
+    #   - does not affect individual model behaviour
+    persistent: true
+
+    # set swap/exclusive to false to prevent swapping inside the group
+    # and the unloading of other groups
+    swap: false
+    exclusive: false
+    members:
+      - "forever-modelA"
+      - "forever-modelB"
+      - "forever-modelc"
\ No newline at end of file
diff --git a/misc/simple-responder/simple-responder.go b/misc/simple-responder/simple-responder.go
index d06840d..d0198cc 100644
--- a/misc/simple-responder/simple-responder.go
+++ b/misc/simple-responder/simple-responder.go
@@ -42,9 +42,12 @@ func main() {
 			time.Sleep(wait)
 		}

+		bodyBytes, _ := io.ReadAll(c.Request.Body)
+
 		c.JSON(http.StatusOK, gin.H{
 			"responseMessage":  *responseMessage,
 			"h_content_length": c.Request.Header.Get("Content-Length"),
+			"request_body":     string(bodyBytes),
 		})
 	})

diff --git a/proxy/config.go b/proxy/config.go
index e4b0a50..e9821e9 100644
--- a/proxy/config.go
+++ b/proxy/config.go
@@ -6,6 +6,7 @@ import (
 	"os"
 	"regexp"
 	"runtime"
+	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -29,6 +30,9 @@ type ModelConfig struct {

 	// Limit concurrency of HTTP requests to process
 	ConcurrencyLimit int `yaml:"concurrencyLimit"`
+
+	// Model filters see issue #174
+	Filters ModelFilters `yaml:"filters"`
 }

 func (m *ModelConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
@@ -63,6 +67,46 @@ func (m *ModelConfig) SanitizedCommand() ([]string, error) {
 	return SanitizeCommand(m.Cmd)
 }

+// ModelFilters see issue #174
+type ModelFilters struct {
+	StripParams string `yaml:"strip_params"`
+}
+
+func (m *ModelFilters) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	type rawModelFilters ModelFilters
+	defaults := rawModelFilters{
+		StripParams: "",
+	}
+
+	if err := unmarshal(&defaults); err != nil {
+		return err
+	}
+
+	*m = ModelFilters(defaults)
+	return nil
+}
+
+func (f ModelFilters) SanitizedStripParams() ([]string, error) {
+	if f.StripParams == "" {
+		return nil, nil
+	}
+
+	params := strings.Split(f.StripParams, ",")
+	cleaned := make([]string, 0, len(params))
+
+	for _, param := range params {
+		trimmed := strings.TrimSpace(param)
+		if trimmed == "model" || trimmed == "" {
+			continue
+		}
+		cleaned = append(cleaned, trimmed)
+	}
+
+	// sort cleaned
+	slices.Sort(cleaned)
+	return cleaned, nil
+}
+
 type GroupConfig struct {
 	Swap      bool     `yaml:"swap"`
 	Exclusive bool     `yaml:"exclusive"`
@@ -212,6 +256,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroValue)
 			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroValue)
 			modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroValue)
+			modelConfig.Filters.StripParams = strings.ReplaceAll(modelConfig.Filters.StripParams, macroSlug, macroValue)
 		}

 		// enforce ${PORT} used in both cmd and proxy
diff --git a/proxy/config_posix_test.go b/proxy/config_posix_test.go
index 2023eb7..4a62a99 100644
--- a/proxy/config_posix_test.go
+++ b/proxy/config_posix_test.go
@@ -83,6 +83,9 @@ models:
 		assert.Equal(t, "", model1.UseModelName)
 		assert.Equal(t, 0, model1.ConcurrencyLimit)
 	}
+
+	// default empty filter exists
+	assert.Equal(t, "", model1.Filters.StripParams)
 }

 func TestConfig_LoadPosix(t *testing.T) {
diff --git a/proxy/config_test.go b/proxy/config_test.go
index 49d4711..70a1d2d 100644
--- a/proxy/config_test.go
+++ b/proxy/config_test.go
@@ -300,3 +300,28 @@ models:
 		})
 	}
 }
+
+func TestConfig_ModelFilters(t *testing.T) {
+	content := `
+macros:
+  default_strip: "temperature, top_p"
+models:
+  model1:
+    cmd: path/to/cmd --port ${PORT}
+    filters:
+      strip_params: "model, top_k, ${default_strip}, , ,"
+`
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+	modelConfig, ok := config.Models["model1"]
+	if !assert.True(t, ok) {
+		t.FailNow()
+	}
+
+	// raw string keeps everything; `model` and empty entries are dropped from the sanitized list
+	assert.Equal(t, "model, top_k, temperature, top_p, , ,", modelConfig.Filters.StripParams)
+	sanitized, err := modelConfig.Filters.SanitizedStripParams()
+	if assert.NoError(t, err) {
+		assert.Equal(t, []string{"temperature", "top_k", "top_p"}, sanitized)
+	}
+}
diff --git a/proxy/config_windows_test.go b/proxy/config_windows_test.go
index d5cb50c..2f3fd30 100644
--- a/proxy/config_windows_test.go
+++ b/proxy/config_windows_test.go
@@ -80,6 +80,9 @@ models:
 		assert.Equal(t, "", model1.UseModelName)
 		assert.Equal(t, 0, model1.ConcurrencyLimit)
 	}
+
+	// default empty filter exists
+	assert.Equal(t, "", model1.Filters.StripParams)
 }

 func TestConfig_LoadWindows(t *testing.T) {
diff --git a/proxy/proxymanager.go b/proxy/proxymanager.go
index 6855e37..268458b 100644
--- a/proxy/proxymanager.go
+++ b/proxy/proxymanager.go
@@ -365,6 +365,21 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
 		}
 	}

+	// issue #174: strip parameters from the JSON body
+	stripParams, err := pm.config.Models[realModelName].Filters.SanitizedStripParams()
+	if err != nil { // just log it and continue
+		pm.proxyLogger.Errorf("Error sanitizing strip params string: %s, %s", pm.config.Models[realModelName].Filters.StripParams, err.Error())
+	} else {
+		for _, param := range stripParams {
+			pm.proxyLogger.Debugf("<%s> stripping param: %s", realModelName, param)
+			bodyBytes, err = sjson.DeleteBytes(bodyBytes, param)
+			if err != nil {
+				pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error deleting parameter %s from request", param))
+				return
+			}
+		}
+	}
+
 	c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))

 	// dechunk it as we already have all the body bytes see issue #11
diff --git a/proxy/proxymanager_test.go b/proxy/proxymanager_test.go
index 32a007a..d62af6a 100644
--- a/proxy/proxymanager_test.go
+++ b/proxy/proxymanager_test.go
@@ -623,3 +623,37 @@ func TestProxyManager_ChatContentLength(t *testing.T) {
 	assert.Equal(t, "81", response["h_content_length"])
 	assert.Equal(t, "model1", response["responseMessage"])
 }
+
+func TestProxyManager_FiltersStripParams(t *testing.T) {
+	modelConfig := getTestSimpleResponderConfig("model1")
+	modelConfig.Filters = ModelFilters{
+		StripParams: "temperature, model, stream",
+	}
+
+	config := AddDefaultGroupToConfig(Config{
+		HealthCheckTimeout: 15,
+		LogLevel:           "error",
+		Models: map[string]ModelConfig{
+			"model1": modelConfig,
+		},
+	})
+
+	proxy := New(config)
+	defer proxy.StopProcesses(StopWaitForInflightRequest)
+	reqBody := `{"model":"model1", "temperature":0.1, "x_param":"123", "y_param":"abc", "stream":true}`
+	req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))
+	w := httptest.NewRecorder()
+
+	proxy.ServeHTTP(w, req)
+	assert.Equal(t, http.StatusOK, w.Code)
+	var response map[string]string
+	assert.NoError(t, json.Unmarshal(w.Body.Bytes(), &response))
+
+	// `temperature` and `stream` are stripped but `model` remains
+	assert.Equal(t, `{"model":"model1", "x_param":"123", "y_param":"abc"}`, response["request_body"])
+
+	// assert.Nil(t, response["temperature"])
+	// assert.Equal(t, "123", response["x_param"])
+	// assert.Equal(t, "abc", response["y_param"])
+	// t.Logf("%v", response)
+}
diff --git a/ui/misc/logo.acorn b/ui/misc/logo.acorn
new file mode 100644
index 0000000..f1185f3
Binary files /dev/null and b/ui/misc/logo.acorn differ
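Reviewer note (not part of the patch): the strip_params flow can be exercised outside the proxy with the same sjson.DeleteBytes call that proxyOAIHandler uses above. The sketch below is illustrative only; the request body and parameter list are made up, and the stripParams slice stands in for what SanitizedStripParams would return.

// Standalone sketch of the strip_params behaviour: each configured parameter
// is deleted from the JSON body with sjson.DeleteBytes.
package main

import (
	"fmt"

	"github.com/tidwall/sjson"
)

func main() {
	// incoming request body before filtering (hypothetical example)
	body := []byte(`{"model":"model1", "temperature":0.1, "top_p":0.9, "stream":true}`)

	// stands in for the output of SanitizedStripParams: trimmed, sorted,
	// empty entries and "model" already removed
	stripParams := []string{"stream", "temperature", "top_p"}

	for _, param := range stripParams {
		var err error
		body, err = sjson.DeleteBytes(body, param)
		if err != nil {
			fmt.Println("delete failed:", err)
			return
		}
	}

	// only "model" is left, e.g. {"model":"model1"}
	fmt.Println(string(body))
}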