Skip to content
Merged
Show file tree
Hide file tree
Changes from 89 commits
Commits
Show all changes
96 commits
Select commit Hold shift + click to select a range
f669140
feat: add activity page and handler
g2mt Jul 18, 2025
f26779b
feat: add metrics logging and UI display
g2mt Jul 18, 2025
822e2cf
feat: add metrics parser and model-specific metrics endpoint
g2mt Jul 18, 2025
198646d
run go fmt
g2mt Jul 18, 2025
e9edd96
refactor: remove GetLatestMetrics and update apiGetMetrics to collect…
g2mt Jul 18, 2025
0d74cf0
refactor: remove ParseLogData from MetricsParser and refactor Process…
g2mt Jul 18, 2025
fc3ca90
refactor: update metrics parsing and API to use input/output tokens
g2mt Jul 18, 2025
67fd770
Remove setInterval
g2mt Jul 18, 2025
8b49999
Rename table column
g2mt Jul 18, 2025
d3f0147
Remove colors, hide token count if zero
g2mt Jul 18, 2025
b274f25
use - for empty token count column
g2mt Jul 18, 2025
75b2cdf
Add metricsMaxInMemory to config
g2mt Jul 18, 2025
4a26d32
Fix whitespace
g2mt Jul 18, 2025
ffd8dae
Remove getSummary
g2mt Jul 18, 2025
bacad51
refactor: remove model-specific metrics parsing and API endpoints
g2mt Jul 18, 2025
b3f5d2b
update tests
g2mt Jul 18, 2025
7ee133b
Run fetchMetrics on mount
g2mt Jul 18, 2025
e9a4156
refactor: update metrics parser to simplify method signatures
g2mt Jul 18, 2025
9f22155
Rename addMetric
g2mt Jul 18, 2025
fea861d
feat: add config-based metrics parser initialization
g2mt Jul 18, 2025
99b3eb2
remove newline
g2mt Jul 18, 2025
39908a1
feat: add metrics persistence to file
g2mt Jul 18, 2025
2fee6b8
document metricsLogPath in example config
g2mt Jul 18, 2025
4d30155
Add MetricsMaxInMemory to windows test
g2mt Jul 18, 2025
749ace4
Check if pm.metricsParser is nil in apiGetMetrics
g2mt Jul 19, 2025
fd7f626
feat: add metricsUseServerResponse config and update proxy logic
g2mt Jul 19, 2025
91b7efe
chore: add metricsUseServerResponse config option
g2mt Jul 19, 2025
9749b69
feat: add useServerResponse to MetricsParser
g2mt Jul 19, 2025
158a202
fix
g2mt Jul 19, 2025
f5b60a0
correct comment
g2mt Jul 19, 2025
6a84eab
Merge remote-tracking branch 'fork/activity-page' into activity-page
g2mt Jul 19, 2025
55efb27
refactor: update generation speed calculation
g2mt Jul 19, 2025
0e79f64
refactor: unify stdout and stderr handling in process.go
g2mt Jul 19, 2025
52436fd
feat: add streaming response handling in proxyOAIHandler
g2mt Jul 19, 2025
4f0ee68
feat: add log event subscription management
g2mt Jul 19, 2025
09e1e95
remove import
g2mt Jul 19, 2025
f473788
Use bufio.NewScanner to parse stdout lines
g2mt Jul 19, 2025
6d7bca3
use custom response recorder
g2mt Jul 19, 2025
f33222d
add bufio import
g2mt Jul 19, 2025
5cfbb84
refactor metrics debug logging
g2mt Jul 19, 2025
cdbc196
Remove metricsLogPath
g2mt Jul 19, 2025
c8be9ad
Merge branch 'activity-page-remove-httptest' into activity-page
g2mt Jul 19, 2025
8746468
Move responserecorder to another file
g2mt Jul 19, 2025
f574b6c
Merge branch 'activity-page-stream' into activity-page
g2mt Jul 19, 2025
e516610
move StreamingResponseRecorder to separate file
g2mt Jul 19, 2025
037e7d9
Rename responserecorder.go, remove NewResponseRecorder
g2mt Jul 19, 2025
3616243
Add Activity streaming
g2mt Jul 19, 2025
9508b7c
add fmt
g2mt Jul 19, 2025
814b533
Remove first fetch
g2mt Jul 19, 2025
b6b8046
fix missing !
g2mt Jul 19, 2025
f36cda1
Rename to StreamingResponseRecorder
g2mt Jul 19, 2025
0c44cfd
Refactor response recorder functions into a single middleware
g2mt Jul 19, 2025
2aa1c38
Rename metrics parser to MetricsMonitor
g2mt Jul 19, 2025
724d270
Rename to metricsDataCancel
g2mt Jul 19, 2025
cc8a1bb
Rename ResponseMiddleware to MetricsMiddleware, process HTTP request …
g2mt Jul 19, 2025
55516c6
Move log parsing to SubscribeToProcessLogs
g2mt Jul 19, 2025
9286fd9
Remove unused log
g2mt Jul 19, 2025
0c92922
Extract metrics recording into method
g2mt Jul 19, 2025
7a9a413
Add comment to OutputTokens
g2mt Jul 19, 2025
df3fbdb
Rename generationSpeed
g2mt Jul 19, 2025
990e8bd
Add comments for regexes
g2mt Jul 19, 2025
0e250a4
Add an ID for token metrics
g2mt Jul 19, 2025
252b451
Refactor into GetMetricsJSONByLines
g2mt Jul 19, 2025
1f5f850
cleanup
g2mt Jul 19, 2025
c80557c
add back toLocaleString
g2mt Jul 19, 2025
7c875e6
Merge branch 'mostlygeek:main' into activity-page
g2mt Jul 19, 2025
a2c6451
revert change
g2mt Jul 19, 2025
7a07a09
Remove GetMetricsJSONByLines
g2mt Jul 19, 2025
1d260f9
Remove apiStreamMetrics and move streaming into /api/events
g2mt Jul 20, 2025
1c6543c
Fix switch case variable declaration scope issue.
g2mt Jul 20, 2025
2e37d38
Fix loading state logic to handle empty metrics.
g2mt Jul 20, 2025
af4b4dc
Remove loading state from Activity
g2mt Jul 20, 2025
464d91c
Remove subtitle
g2mt Jul 20, 2025
ef97c68
fix style
g2mt Jul 20, 2025
6fd771b
remove debug logger
g2mt Jul 20, 2025
5ce9abd
Remove metricsMonitor event bus
g2mt Jul 20, 2025
6ed6dc8
remove nil checks
g2mt Jul 20, 2025
a0d1161
refactor MetricsMiddleware
g2mt Jul 20, 2025
e836ebc
Clean up ResponseWriter
g2mt Jul 20, 2025
93af520
refactor metrics middleware
g2mt Jul 20, 2025
741ca5a
add comment
g2mt Jul 20, 2025
8f2d75b
Remove flush
g2mt Jul 20, 2025
ce27be5
remove mm from irrelevant endpoints
g2mt Jul 22, 2025
b5ad4d9
move requested model parsing to ls-requested-model key
g2mt Jul 22, 2025
0813157
Remove import
g2mt Jul 22, 2025
2de9250
rm
g2mt Jul 22, 2025
82522b6
Add MiddlewareWritesMetrics tests
g2mt Jul 22, 2025
bab8b79
Rename metrics parser
g2mt Jul 22, 2025
f4288bb
record modelName for metrics in proxyOAIHandler
g2mt Jul 22, 2025
cd9dc5b
Add streaming to simple-responder.go
g2mt Jul 22, 2025
3a94a96
hide stream behind url query
g2mt Jul 22, 2025
19ca3a8
wrong test
g2mt Jul 22, 2025
4b0e94f
Convert to interface{}
g2mt Jul 22, 2025
a245674
fix test
g2mt Jul 22, 2025
05509e6
Get realModelName in middleware
g2mt Jul 22, 2025
242da36
add startTime
g2mt Jul 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ healthCheckTimeout: 500
# - Valid log levels: debug, info, warn, error
logLevel: info

# metricsMaxInMemory: maximum number of metrics to keep in memory
# - optional, default: 1000
# - controls how many metrics are stored in memory before older ones are discarded
# - useful for limiting memory usage when processing large volumes of metrics
metricsMaxInMemory: 1000

# startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
Expand Down Expand Up @@ -200,4 +206,4 @@ groups:
members:
- "forever-modelA"
- "forever-modelB"
- "forever-modelc"
- "forever-modelc"
1 change: 1 addition & 0 deletions docker/config.example.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
healthCheckTimeout: 300
logRequests: true
metricsMaxInMemory: 1000

models:
"qwen2.5":
Expand Down
10 changes: 10 additions & 0 deletions misc/simple-responder/simple-responder.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ func main() {
"responseMessage": *responseMessage,
"h_content_length": c.Request.Header.Get("Content-Length"),
"request_body": string(bodyBytes),
"usage": gin.H{
"completion_tokens": 10,
"prompt_tokens": 25,
"total_tokens": 35,
},
})
})

Expand All @@ -74,6 +79,11 @@ func main() {
c.Header("Content-Type", "application/json")
c.JSON(http.StatusOK, gin.H{
"responseMessage": *responseMessage,
"usage": gin.H{
"completion_tokens": 10,
"prompt_tokens": 25,
"total_tokens": 35,
},
})

})
Expand Down
2 changes: 2 additions & 0 deletions proxy/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ type Config struct {
HealthCheckTimeout int `yaml:"healthCheckTimeout"`
LogRequests bool `yaml:"logRequests"`
LogLevel string `yaml:"logLevel"`
MetricsMaxInMemory int `yaml:"metricsMaxInMemory"`
Models map[string]ModelConfig `yaml:"models"` /* key is model ID */
Profiles map[string][]string `yaml:"profiles"`
Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */
Expand Down Expand Up @@ -194,6 +195,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
HealthCheckTimeout: 120,
StartPort: 5800,
LogLevel: "info",
MetricsMaxInMemory: 1000,
}
err = yaml.Unmarshal(data, &config)
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions proxy/config_posix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ groups:
},
},
HealthCheckTimeout: 15,
MetricsMaxInMemory: 1000,
Profiles: map[string][]string{
"test": {"model1", "model2"},
},
Expand Down
1 change: 1 addition & 0 deletions proxy/config_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ groups:
},
},
HealthCheckTimeout: 15,
MetricsMaxInMemory: 1000,
Profiles: map[string][]string{
"test": {"model1", "model2"},
},
Expand Down
1 change: 1 addition & 0 deletions proxy/events.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const ProcessStateChangeEventID = 0x01
const ChatCompletionStatsEventID = 0x02
const ConfigFileChangedEventID = 0x03
const LogDataEventID = 0x04
const TokenMetricsEventID = 0x05

type ProcessStateChangeEvent struct {
ProcessName string
Expand Down
145 changes: 145 additions & 0 deletions proxy/metrics_middleware.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
package proxy

import (
"bytes"
"io"
"net/http"
"time"

"github.com/gin-gonic/gin"
"github.com/tidwall/gjson"
)

// MetricsMiddleware sets up the MetricsResponseWriter for capturing upstream
// requests so token usage can be extracted from the response body after the
// handler chain completes. It buffers the request body (restoring it for
// downstream handlers), rejects requests without a "model" key, and stores
// the requested model under the "ls-requested-model" context key for
// proxyOAIHandler.
func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
	return func(c *gin.Context) {
		bodyBytes, err := io.ReadAll(c.Request.Body)
		if err != nil {
			pm.sendErrorResponse(c, http.StatusBadRequest, "could not read request body")
			return
		}
		// Restore the body so downstream handlers can read it again.
		c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))

		requestedModel := gjson.GetBytes(bodyBytes, "model").String()
		if requestedModel == "" {
			pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
			return
		}
		c.Set("ls-requested-model", requestedModel)

		writer := &MetricsResponseWriter{
			ResponseWriter: c.Writer,
			metricsRecorder: &MetricsRecorder{
				metricsMonitor: pm.metricsMonitor,
				modelName:      requestedModel, // will be updated in proxyOAIHandler
				isStreaming:    gjson.GetBytes(bodyBytes, "stream").Bool(),
				// startTime must be initialized here: parseAndRecordMetrics
				// computes time.Since(startTime); a zero time.Time would
				// yield an absurd duration and tokens-per-second value.
				startTime: time.Now(),
			},
		}
		c.Writer = writer
		c.Next()

		// Request finished; parse the captured response for usage metrics.
		rec := writer.metricsRecorder
		rec.processBody(writer.body)
	}
}

// MetricsRecorder accumulates the per-request context needed to turn a
// proxied response body into a TokenMetrics record.
type MetricsRecorder struct {
	metricsMonitor *MetricsMonitor // sink that stores and publishes recorded metrics
	modelName      string          // set to the real model name in proxyOAIHandler
	isStreaming    bool            // true when the request body had "stream": true
	startTime      time.Time       // used to compute request duration in parseAndRecordMetrics
}

// processBody handles response processing after the request completes,
// dispatching to the streaming (SSE) or plain-JSON parser depending on
// how the request was made.
func (rec *MetricsRecorder) processBody(body []byte) {
	if !rec.isStreaming {
		rec.processNonStreamingResponse(body)
		return
	}
	rec.processStreamingResponse(body)
}

// parseAndRecordMetrics extracts the OpenAI-style "usage" object from a
// response JSON payload and records token metrics for this request.
// Payloads with no usage object, or with zero completion tokens, are
// silently ignored.
func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) {
	usage := jsonData.Get("usage")
	if !usage.Exists() {
		return
	}

	outputTokens := int(usage.Get("completion_tokens").Int())
	inputTokens := int(usage.Get("prompt_tokens").Int())
	if outputTokens <= 0 {
		return
	}

	duration := time.Since(rec.startTime)

	// Guard against a zero or negative duration (clock adjustment, unset
	// startTime): dividing by it would produce +Inf or NaN, which breaks
	// JSON encoding of TokensPerSecond.
	tokensPerSecond := 0.0
	if secs := duration.Seconds(); secs > 0 {
		tokensPerSecond = float64(inputTokens+outputTokens) / secs
	}

	rec.metricsMonitor.addMetrics(TokenMetrics{
		Timestamp:       time.Now(),
		Model:           rec.modelName,
		InputTokens:     inputTokens,
		OutputTokens:    outputTokens,
		TokensPerSecond: tokensPerSecond,
		DurationMs:      int(duration.Milliseconds()),
	})
}

// processStreamingResponse scans an SSE response body line by line, pulls
// the JSON payload out of each "data: " event, and records any usage
// metrics it finds. Scanning stops at the "[DONE]" sentinel.
func (rec *MetricsRecorder) processStreamingResponse(body []byte) {
	ssePrefix := []byte("data: ")
	for _, raw := range bytes.Split(body, []byte("\n")) {
		line := bytes.TrimSpace(raw)
		if len(line) == 0 || !bytes.HasPrefix(line, ssePrefix) {
			continue
		}

		payload := bytes.TrimSpace(line[len(ssePrefix):])
		if len(payload) == 0 {
			continue
		}
		if bytes.Equal(payload, []byte("[DONE]")) {
			return
		}

		// Only attempt extraction on well-formed JSON payloads.
		if gjson.ValidBytes(payload) {
			rec.parseAndRecordMetrics(gjson.ParseBytes(payload))
		}
	}
}

// processNonStreamingResponse parses a complete (non-SSE) JSON response
// body and records any usage metrics it contains. Empty or malformed
// bodies are ignored.
func (rec *MetricsRecorder) processNonStreamingResponse(body []byte) {
	if len(body) == 0 || !gjson.ValidBytes(body) {
		return
	}
	rec.parseAndRecordMetrics(gjson.ParseBytes(body))
}

// MetricsResponseWriter captures the entire response for non-streaming
// requests (and the full SSE stream for streaming ones) so usage metrics
// can be parsed after the handler chain finishes.
type MetricsResponseWriter struct {
	gin.ResponseWriter
	body            []byte           // every byte successfully written downstream, in order
	metricsRecorder *MetricsRecorder // per-request metrics context consumed by MetricsMiddleware
}

// Write forwards the bytes to the wrapped ResponseWriter and, on success,
// appends them to the in-memory copy used later for metrics extraction.
func (w *MetricsResponseWriter) Write(p []byte) (int, error) {
	n, err := w.ResponseWriter.Write(p)
	if err == nil {
		w.body = append(w.body, p...)
	}
	return n, err
}

// WriteHeader passes the status code straight through to the wrapped writer.
// NOTE(review): pure delegation — the embedded gin.ResponseWriter would
// provide this implicitly; the override exists only to make the pass-through
// explicit.
func (w *MetricsResponseWriter) WriteHeader(statusCode int) {
	w.ResponseWriter.WriteHeader(statusCode)
}

// Header returns the header map of the wrapped writer unchanged.
// NOTE(review): pure delegation, kept explicit alongside WriteHeader.
func (w *MetricsResponseWriter) Header() http.Header {
	return w.ResponseWriter.Header()
}
82 changes: 82 additions & 0 deletions proxy/metrics_monitor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package proxy

import (
"encoding/json"
"sync"
"time"

"github.com/mostlygeek/llama-swap/event"
)

// TokenMetrics represents parsed token statistics from llama-server logs
type TokenMetrics struct {
	ID        int       `json:"id"` // monotonically increasing; assigned by MetricsMonitor.addMetrics
	Timestamp time.Time `json:"timestamp"`
	Model     string    `json:"model"`
	InputTokens     int     `json:"input_tokens"`      // from usage.prompt_tokens in the upstream response
	OutputTokens    int     `json:"output_tokens"`     // from usage.completion_tokens in the upstream response
	TokensPerSecond float64 `json:"tokens_per_second"` // (input+output) tokens divided by request duration
	DurationMs      int     `json:"duration_ms"`       // wall-clock request duration in milliseconds
}

// TokenMetricsEvent represents a token metrics event published on the
// process event bus whenever a new metric is recorded.
type TokenMetricsEvent struct {
	Metrics TokenMetrics
}

// Type returns the event-bus identifier for token metrics events.
func (e TokenMetricsEvent) Type() uint32 {
	return TokenMetricsEventID // defined in events.go
}

// MetricsMonitor parses llama-server output for token statistics
type MetricsMonitor struct {
	mu         sync.RWMutex   // guards metrics and nextID
	metrics    []TokenMetrics // most recent metrics, oldest first, capped at maxMetrics
	maxMetrics int            // upper bound on len(metrics); older entries are discarded
	nextID     int            // next TokenMetrics.ID to assign
}

// NewMetricsMonitor builds a MetricsMonitor whose in-memory history size
// comes from config.MetricsMaxInMemory, falling back to 1000 when the
// configured value is zero or negative.
func NewMetricsMonitor(config *Config) *MetricsMonitor {
	limit := config.MetricsMaxInMemory
	if limit <= 0 {
		limit = 1000 // default fallback
	}
	return &MetricsMonitor{maxMetrics: limit}
}

// addMetrics assigns the metric a sequential ID, appends it to the bounded
// in-memory history (discarding the oldest entries beyond maxMetrics), and
// publishes a TokenMetricsEvent.
func (mp *MetricsMonitor) addMetrics(metric TokenMetrics) {
	mp.mu.Lock()
	metric.ID = mp.nextID
	mp.nextID++
	mp.metrics = append(mp.metrics, metric)
	if len(mp.metrics) > mp.maxMetrics {
		mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
	}
	mp.mu.Unlock()

	// Emit AFTER releasing the lock: a subscriber handled synchronously by
	// event.Emit that calls GetMetrics/GetMetricsJSON would otherwise
	// deadlock on the non-reentrant RWMutex.
	event.Emit(TokenMetricsEvent{Metrics: metric})
}

// GetMetrics returns a snapshot copy of the current metrics, so callers
// can read the result without holding the monitor's lock.
func (mp *MetricsMonitor) GetMetrics() []TokenMetrics {
	mp.mu.RLock()
	defer mp.mu.RUnlock()

	snapshot := make([]TokenMetrics, len(mp.metrics))
	copy(snapshot, mp.metrics)
	return snapshot
}

// GetMetricsJSON returns metrics as JSON
// NOTE: before any metric is recorded the backing slice is nil, so this
// encodes as JSON "null" rather than "[]".
func (mp *MetricsMonitor) GetMetricsJSON() ([]byte, error) {
	mp.mu.RLock()
	defer mp.mu.RUnlock()
	return json.Marshal(mp.metrics)
}
19 changes: 10 additions & 9 deletions proxy/proxymanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (
"time"

"github.com/gin-gonic/gin"
"github.com/tidwall/gjson"
"github.com/tidwall/sjson"
)

Expand All @@ -33,6 +32,8 @@ type ProxyManager struct {
upstreamLogger *LogMonitor
muxLogger *LogMonitor

metricsMonitor *MetricsMonitor

processGroups map[string]*ProcessGroup

// shutdown signaling
Expand Down Expand Up @@ -78,6 +79,8 @@ func New(config Config) *ProxyManager {
muxLogger: stdoutLogger,
upstreamLogger: upstreamLogger,

metricsMonitor: NewMetricsMonitor(&config),

processGroups: make(map[string]*ProcessGroup),

shutdownCtx: shutdownCtx,
Expand Down Expand Up @@ -149,10 +152,12 @@ func (pm *ProxyManager) setupGinEngine() {
c.Next()
})

mm := MetricsMiddleware(pm)

// Set up routes using the Gin engine
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
pm.ginEngine.POST("/v1/chat/completions", mm, pm.proxyOAIHandler)
// Support legacy /v1/completions api, see issue #12
pm.ginEngine.POST("/v1/completions", pm.proxyOAIHandler)
pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)

// Support embeddings
pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler)
Expand Down Expand Up @@ -360,17 +365,13 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
return
}

requestedModel := gjson.GetBytes(bodyBytes, "model").String()
if requestedModel == "" {
pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
return
}

requestedModel := c.GetString("ls-requested-model") // Should be set in MetricsMiddleware
processGroup, realModelName, err := pm.swapProcessGroup(requestedModel)
if err != nil {
pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error swapping process group: %s", err.Error()))
return
}
c.Writer.(*MetricsResponseWriter).metricsRecorder.modelName = realModelName

// issue #69 allow custom model names to be sent to upstream
useModelName := pm.config.Models[realModelName].UseModelName
Expand Down
Loading
Loading