@@ -41,10 +41,10 @@ const (
4141
4242// Session represents a single WebSocket connection and its state
4343type Session struct {
44- ID string
45- TranscriptionOnly bool
44+ ID string
45+ TranscriptionOnly bool
4646 // The pipeline or any-to-any model name (full realtime mode)
47- Model string
47+ Model string
4848 // The voice may be a TTS model name or a parameter passed to a TTS model
4949 Voice string
5050 TurnDetection * types.TurnDetectionUnion // "server_vad", "semantic_vad" or "none"
@@ -58,7 +58,7 @@ type Session struct {
5858 DefaultConversationID string
5959 ModelInterface Model
6060 // The pipeline model config or the config for an any-to-any model
61- ModelConfig * config.ModelConfig
61+ ModelConfig * config.ModelConfig
6262}
6363
6464func (s * Session ) FromClient (session * types.SessionUnion ) {
@@ -121,8 +121,9 @@ var sessionLock sync.Mutex
// Model is the set of backend operations a realtime session relies on; a
// Session stores one in its ModelInterface field. It groups voice-activity
// detection (VAD), transcription, prediction and text-to-speech.
//
// Predict does not return the response directly: it returns a function that
// the caller invokes afterwards to obtain the backend.LLMResponse (see
// generateResponse, which calls predFunc() after Predict succeeds).
//
// NOTE(review): in this diff the "-" line is the previous Predict signature,
// which took tools and toolChoice as strings, and the "+" line is the new one,
// which takes []types.ToolUnion and *types.ToolChoiceUnion. PredictConfig is
// newly added; generateResponse reads its result for the reply prefix and the
// function-call parsing settings. Whether it may return nil is not visible
// here — confirm against the implementations.
121121type Model interface {
122122 VAD (ctx context.Context , request * schema.VADRequest ) (* schema.VADResponse , error )
123123 Transcribe (ctx context.Context , audio , language string , translate bool , diarize bool , prompt string ) (* schema.TranscriptionResult , error )
124- Predict (ctx context.Context , messages schema.Messages , images , videos , audios []string , tokenCallback func (string , backend.TokenUsage ) bool , tools string , toolChoice string , logprobs * int , topLogprobs * int , logitBias map [string ]float64 ) (func () (backend.LLMResponse , error ), error )
124+ Predict (ctx context.Context , messages schema.Messages , images , videos , audios []string , tokenCallback func (string , backend.TokenUsage ) bool , tools []types. ToolUnion , toolChoice * types. ToolChoiceUnion , logprobs * int , topLogprobs * int , logitBias map [string ]float64 ) (func () (backend.LLMResponse , error ), error )
125125 TTS (ctx context.Context , text , voice , language string ) (string , * proto.Result , error )
126+ PredictConfig () * config.ModelConfig
126127}
127128
128129var upgrader = websocket.Upgrader {
@@ -765,7 +766,7 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
765766 }
766767
767768 if ! session .TranscriptionOnly {
768- generateResponse (session . ModelConfig , session , utt , transcript , conv , c , websocket .TextMessage )
769+ generateResponse (session , utt , transcript , conv , c , websocket .TextMessage )
769770 }
770771}
771772
@@ -790,9 +791,11 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS
790791}
791792
792793// Function to generate a response based on the conversation
793- func generateResponse (config * config. ModelConfig , session * Session , utt []byte , transcript string , conv * Conversation , c * websocket.Conn , mt int ) {
794+ func generateResponse (session * Session , utt []byte , transcript string , conv * Conversation , c * websocket.Conn , mt int ) {
794795 xlog .Debug ("Generating realtime response..." )
795796
797+ config := session .ModelInterface .PredictConfig ()
798+
796799 item := types.MessageItemUnion {
797800 User : & types.MessageItemUser {
798801 ID : generateItemID (),
@@ -881,19 +884,7 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
881884 },
882885 })
883886
884- toolsJSON := ""
885- if len (session .Tools ) > 0 {
886- b , _ := json .Marshal (session .Tools )
887- toolsJSON = string (b )
888- }
889-
890- toolChoiceJSON := ""
891- if session .ToolChoice != nil {
892- b , _ := json .Marshal (session .ToolChoice )
893- toolChoiceJSON = string (b )
894- }
895-
896- predFunc , err := session .ModelInterface .Predict (context .TODO (), conversationHistory , nil , nil , nil , nil , toolsJSON , toolChoiceJSON , nil , nil , nil )
887+ predFunc , err := session .ModelInterface .Predict (context .TODO (), conversationHistory , nil , nil , nil , nil , session .Tools , session .ToolChoice , nil , nil , nil )
897888 if err != nil {
898889 sendError (c , "inference_failed" , fmt .Sprintf ("backend error: %v" , err ), "" , item .Assistant .ID )
899890 return
@@ -902,8 +893,11 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
902893 pred , err := predFunc ()
903894 if err != nil {
904895 sendError (c , "prediction_failed" , fmt .Sprintf ("backend error: %v" , err ), "" , item .Assistant .ID )
896+ return
905897 }
906898
899+ xlog .Debug ("Function config for parsing" , "function_name_key" , config .FunctionsConfig .FunctionNameKey , "function_arguments_key" , config .FunctionsConfig .FunctionArgumentsKey )
900+
907901 rawResponse := pred .Response
908902 if config .TemplateConfig .ReplyPrefix != "" {
909903 rawResponse = config .TemplateConfig .ReplyPrefix + rawResponse
@@ -916,6 +910,8 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
916910 cleanedResponse := functions .CleanupLLMResult (responseWithoutReasoning , config .FunctionsConfig )
917911 toolCalls := functions .ParseFunctionCall (cleanedResponse , config .FunctionsConfig )
918912
913+ xlog .Debug ("Function call parsing" , "textContent" , textContent , "cleanedResponse" , cleanedResponse , "toolCallsCount" , len (toolCalls ))
914+
919915 noActionName := "answer"
920916 if config .FunctionsConfig .NoActionFunctionName != "" {
921917 noActionName = config .FunctionsConfig .NoActionFunctionName
@@ -932,15 +928,23 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
932928 if m , exists := arguments ["message" ]; exists {
933929 if message , ok := m .(string ); ok {
934930 finalSpeech = message
931+ } else {
932+ xlog .Warn ("NoAction function message field is not a string" , "type" , fmt .Sprintf ("%T" , m ))
935933 }
934+ } else {
935+ xlog .Warn ("NoAction function missing 'message' field in arguments" )
936936 }
937+ } else {
938+ xlog .Warn ("Failed to unmarshal NoAction function arguments" , "error" , err , "arguments" , arg )
937939 }
938940 if finalSpeech == "" {
939941 // Fallback if parsing failed
942+ xlog .Warn ("NoAction function did not produce speech, using cleaned response as fallback" )
940943 finalSpeech = cleanedResponse
941944 }
942945 } else {
943946 finalToolCalls = toolCalls
947+ xlog .Debug ("Setting finalToolCalls" , "count" , len (finalToolCalls ))
944948 if len (toolCalls ) > 0 {
945949 finalSpeech = textContent
946950 } else {
@@ -1060,6 +1064,7 @@ func generateResponse(config *config.ModelConfig, session *Session, utt []byte,
10601064 }
10611065
10621066 // Handle Tool Calls
1067+ xlog .Debug ("About to handle tool calls" , "finalToolCallsCount" , len (finalToolCalls ))
10631068 for i , tc := range finalToolCalls {
10641069 toolCallID := generateItemID ()
10651070 callID := "call_" + generateUniqueID () // OpenAI uses call_xyz
0 commit comments