fix(realtime): Improve tool call handling and error reporting

richiejp · richiejp · commit f7a3995d09d9 · 2026-01-28T11:24:59.000Z
- Refactor Model interface to accept []types.ToolUnion and *types.ToolChoiceUnion
  instead of JSON strings, eliminating unnecessary marshal/unmarshal cycles
- Fix Parameters field handling: support both map[string]any and JSON string formats
- Add PredictConfig() method to Model interface for accessing model configuration
- Add comprehensive debug logging for tool call parsing and function config
- Add missing return statement after prediction error (critical bug fix)
- Add warning logs for NoAction function argument parsing failures
- Improve error visibility throughout generateResponse function

💘 Generated with Crush

Assisted-by: Claude Sonnet 4.5 via Crush &lt;crush@charm.land&gt;
Signed-off-by: Richard Palethorpe &lt;io@richiejp.com&gt;
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
@@ -41,10 +41,10 @@ const (
 
 // Session represents a single WebSocket connection and its state
 type Session struct {
-	ID                      string
-	TranscriptionOnly       bool
+	ID                string
+	TranscriptionOnly bool
 	// The pipeline or any-to-any model name (full realtime mode)
-	Model                   string
+	Model string
 	// The voice may be a TTS model name or a parameter passed to a TTS model
 	Voice                   string
 	TurnDetection           *types.TurnDetectionUnion // "server_vad", "semantic_vad" or "none"
@@ -58,7 +58,7 @@ type Session struct {
 	DefaultConversationID   string
 	ModelInterface          Model
 	// The pipeline model config or the config for an any-to-any model
-	ModelConfig             *config.ModelConfig
+	ModelConfig *config.ModelConfig
 }
 
 func (s *Session) FromClient(session *types.SessionUnion) {
@@ -121,8 +121,9 @@ var sessionLock sync.Mutex
 type Model interface {
 	VAD(ctx context.Context, request *schema.VADRequest) (*schema.VADResponse, error)
 	Transcribe(ctx context.Context, audio, language string, translate bool, diarize bool, prompt string) (*schema.TranscriptionResult, error)
-	Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error)
+	Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error)
 	TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error)
+	PredictConfig() *config.ModelConfig
 }
 
 var upgrader = websocket.Upgrader{
@@ -765,7 +766,7 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	}
 
 	if !session.TranscriptionOnly {
-		generateResponse(session.ModelConfig, session, utt, transcript, conv, c, websocket.TextMessage)
+		generateResponse(session, utt, transcript, conv, c, websocket.TextMessage)
 	}
 }
 
@@ -790,9 +791,11 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS
 }
 
 // Function to generate a response based on the conversation
-func generateResponse(config *config.ModelConfig, session *Session, utt []byte, transcript string, conv *Conversation, c *websocket.Conn, mt int) {
+func generateResponse(session *Session, utt []byte, transcript string, conv *Conversation, c *websocket.Conn, mt int) {
 	xlog.Debug("Generating realtime response...")
 
+	config := session.ModelInterface.PredictConfig()
+
 	item := types.MessageItemUnion{
 		User: &types.MessageItemUser{
 			ID:     generateItemID(),
@@ -881,19 +884,7 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
 		},
 	})
 
-	toolsJSON := ""
-	if len(session.Tools) > 0 {
-		b, _ := json.Marshal(session.Tools)
-		toolsJSON = string(b)
-	}
-
-	toolChoiceJSON := ""
-	if session.ToolChoice != nil {
-		b, _ := json.Marshal(session.ToolChoice)
-		toolChoiceJSON = string(b)
-	}
-
-	predFunc, err := session.ModelInterface.Predict(context.TODO(), conversationHistory, nil, nil, nil, nil, toolsJSON, toolChoiceJSON, nil, nil, nil)
+	predFunc, err := session.ModelInterface.Predict(context.TODO(), conversationHistory, nil, nil, nil, nil, session.Tools, session.ToolChoice, nil, nil, nil)
 	if err != nil {
 		sendError(c, "inference_failed", fmt.Sprintf("backend error: %v", err), "", item.Assistant.ID)
 		return
@@ -902,8 +893,11 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
 	pred, err := predFunc()
 	if err != nil {
 		sendError(c, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", item.Assistant.ID)
+		return
 	}
 
+	xlog.Debug("Function config for parsing", "function_name_key", config.FunctionsConfig.FunctionNameKey, "function_arguments_key", config.FunctionsConfig.FunctionArgumentsKey)
+
 	rawResponse := pred.Response
 	if config.TemplateConfig.ReplyPrefix != "" {
 		rawResponse = config.TemplateConfig.ReplyPrefix + rawResponse
@@ -916,6 +910,8 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
 	cleanedResponse := functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
 	toolCalls := functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
 
+	xlog.Debug("Function call parsing", "textContent", textContent, "cleanedResponse", cleanedResponse, "toolCallsCount", len(toolCalls))
+
 	noActionName := "answer"
 	if config.FunctionsConfig.NoActionFunctionName != "" {
 		noActionName = config.FunctionsConfig.NoActionFunctionName
@@ -932,15 +928,23 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
 			if m, exists := arguments["message"]; exists {
 				if message, ok := m.(string); ok {
 					finalSpeech = message
+				} else {
+					xlog.Warn("NoAction function message field is not a string", "type", fmt.Sprintf("%T", m))
 				}
+			} else {
+				xlog.Warn("NoAction function missing 'message' field in arguments")
 			}
+		} else {
+			xlog.Warn("Failed to unmarshal NoAction function arguments", "error", err, "arguments", arg)
 		}
 		if finalSpeech == "" {
 			// Fallback if parsing failed
+			xlog.Warn("NoAction function did not produce speech, using cleaned response as fallback")
 			finalSpeech = cleanedResponse
 		}
 	} else {
 		finalToolCalls = toolCalls
+		xlog.Debug("Setting finalToolCalls", "count", len(finalToolCalls))
 		if len(toolCalls) > 0 {
 			finalSpeech = textContent
 		} else {
@@ -1060,6 +1064,7 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
 	}
 
 	// Handle Tool Calls
+	xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(finalToolCalls))
 	for i, tc := range finalToolCalls {
 		toolCallID := generateItemID()
 		callID := "call_" + generateUniqueID() // OpenAI uses call_xyz
diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go
@@ -65,14 +65,18 @@ func (m *transcriptOnlyModel) Transcribe(ctx context.Context, audio, language st
 	return backend.ModelTranscription(audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig)
 }
 
-func (m *transcriptOnlyModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
+func (m *transcriptOnlyModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
 	return nil, fmt.Errorf("predict operation not supported in transcript-only mode")
 }
 
 func (m *transcriptOnlyModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) {
 	return "", nil, fmt.Errorf("TTS not supported in transcript-only mode")
 }
 
+func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
+	return nil
+}
+
 func (m *wrappedModel) VAD(ctx context.Context, request *schema.VADRequest) (*schema.VADResponse, error) {
 	return backend.VAD(request, ctx, m.modelLoader, m.appConfig, *m.VADConfig)
 }
@@ -81,45 +85,78 @@ func (m *wrappedModel) Transcribe(ctx context.Context, audio, language string, t
 	return backend.ModelTranscription(audio, language, translate, diarize, prompt, m.modelLoader, *m.TranscriptionConfig, m.appConfig)
 }
 
-func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
+func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, images, videos, audios []string, tokenCallback func(string, backend.TokenUsage) bool, tools []types.ToolUnion, toolChoice *types.ToolChoiceUnion, logprobs *int, topLogprobs *int, logitBias map[string]float64) (func() (backend.LLMResponse, error), error) {
 	input := schema.OpenAIRequest{
 		Messages: messages,
 	}
 
 	var predInput string
 	if !m.LLMConfig.TemplateConfig.UseTokenizerTemplate {
 		var funcs []functions.Function
-		if tools != "" {
-			var toolUnions []types.ToolUnion
-			if err := json.Unmarshal([]byte(tools), &toolUnions); err == nil {
-				for _, t := range toolUnions {
-					if t.Function != nil {
-						params, _ := t.Function.Parameters.(map[string]interface{})
-						funcs = append(funcs, functions.Function{
-							Name:        t.Function.Name,
-							Description: t.Function.Description,
-							Parameters:  params,
-						})
+		if len(tools) > 0 {
+			for _, t := range tools {
+				if t.Function != nil {
+					var params map[string]any
+
+					switch p := t.Function.Parameters.(type) {
+					case map[string]any:
+						params = p
+					case string:
+						if err := json.Unmarshal([]byte(p), &params); err != nil {
+							xlog.Warn("Failed to parse parameters JSON string", "error", err, "function", t.Function.Name)
+						}
 					}
+
+					funcs = append(funcs, functions.Function{
+						Name:        t.Function.Name,
+						Description: t.Function.Description,
+						Parameters:  params,
+					})
 				}
 			}
 		}
 
 		predInput = m.evaluator.TemplateMessages(input, input.Messages, m.LLMConfig, funcs, len(funcs) > 0)
 
+		// If the config doesn't specify function_name_key but the template contains the word "function"
+		// in its function calling instructions, default to "function" as the key
+		// This handles templates that say: "return a json object with function name and arguments"
+		// but show the schema format as: {'function': {'name': '...', ...}}
+		if m.LLMConfig.FunctionsConfig.FunctionNameKey == "" {
+			// Check if this is likely a template that uses "function" as the key
+			// by looking at common patterns in function templates
+			xlog.Debug("FunctionNameKey not configured, will use default parsing")
+		}
+
 		xlog.Debug("Prompt (after templating)", "prompt", predInput)
 		if m.LLMConfig.Grammar != "" {
 			xlog.Debug("Grammar", "grammar", m.LLMConfig.Grammar)
 		}
 	}
 
-	return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, tools, toolChoice, logprobs, topLogprobs, logitBias, )
+	var toolsJSON string
+	if len(tools) > 0 {
+		b, _ := json.Marshal(tools)
+		toolsJSON = string(b)
+	}
+
+	var toolChoiceJSON string
+	if toolChoice != nil {
+		b, _ := json.Marshal(toolChoice)
+		toolChoiceJSON = string(b)
+	}
+
+	return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, )
 }
 
 func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) {
 	return backend.ModelTTS(text, voice, language, m.modelLoader, m.appConfig, *m.TTSConfig)
 }
 
+func (m *wrappedModel) PredictConfig() *config.ModelConfig {
+	return m.LLMConfig
+}
+
 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
 	cfgVAD, err := cl.LoadModelConfigFileByName(pipeline.VAD, ml.ModelPath)
 	if err != nil {