From 8aff1612481649bc60157098088cf23a3ab4a871 Mon Sep 17 00:00:00 2001
From: Julien Veyssier
Date: Thu, 4 Dec 2025 15:12:55 +0100
Subject: [PATCH] feat(audio-chat): detect audio input format when sending it as a message to the chat completion endpoint, support wav and mp3

Signed-off-by: Julien Veyssier
---
Review note: the oneStep() hunks below were revised by hand. The MIME type is
now detected from the bytes that are already loaded, so the file is no longer
opened a second time and no stream handle is left unclosed. The diffstat and
hunk headers were updated to match; the post-image blob id on the second
"index" line was NOT regenerated. Line breaks and context-line whitespace were
restored from a whitespace-collapsed copy of this patch: verify them against
the tree before applying.

 lib/Service/OpenAiAPIService.php                |  4 +++-
 lib/TaskProcessing/AudioToAudioChatProvider.php | 20 ++++++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/lib/Service/OpenAiAPIService.php b/lib/Service/OpenAiAPIService.php
index b8e79328..a7652aec 100644
--- a/lib/Service/OpenAiAPIService.php
+++ b/lib/Service/OpenAiAPIService.php
@@ -490,6 +490,7 @@ public function createCompletion(
 	 * @param string|null $toolMessage JSON string with role, content, tool_call_id
 	 * @param array|null $tools
 	 * @param string|null $userAudioPromptBase64
+	 * @param string|null $userAudioPromptFormat Format of the audio prompt sent as input_audio.format, 'wav' is used when null
 	 * @return array{messages: array, tool_calls: array, audio_messages: list>}
 	 * @throws Exception
 	 */
@@ -505,6 +506,7 @@ public function createChatCompletion(
 		?string $toolMessage = null,
 		?array $tools = null,
 		?string $userAudioPromptBase64 = null,
+		?string $userAudioPromptFormat = null,
 	): array {
 		if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TEXT)) {
 			throw new Exception($this->l10n->t('Text generation quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
@@ -560,7 +562,7 @@ public function createChatCompletion(
 						'type' => 'input_audio',
 						'input_audio' => [
 							'data' => $userAudioPromptBase64,
-							'format' => 'wav',
+							'format' => $userAudioPromptFormat ?? 'wav',
 						],
 					],
 				],
diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index 4cbe3ac3..9e0502c0 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -25,6 +25,15 @@
 
 class AudioToAudioChatProvider implements ISynchronousProvider {
 
+	// OpenAI supports wav and mp3
+	// https://platform.openai.com/docs/api-reference/chat/create#chat-create-messages
+	private const SUPPORTED_INPUT_AUDIO_FORMATS = [
+		'audio/mp3' => 'mp3',
+		'audio/mpeg' => 'mp3',
+		'audio/wav' => 'wav',
+		'audio/x-wav' => 'wav',
+	];
+
 	public function __construct(
 		private OpenAiAPIService $openAiAPIService,
 		private IL10N $l,
@@ -213,6 +222,13 @@ private function oneStep(
 		string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName,
 	): array {
 		$result = [];
-		$b64Audio = base64_encode($inputFile->getContent());
+		$audioInputContent = $inputFile->getContent();
+		// detect the format from the bytes we already hold: no second read, no stream handle left open
+		$audioInputMimetype = (new \finfo(FILEINFO_MIME_TYPE))->buffer($audioInputContent);
+		// buffer() returns false on failure; undetected or unsupported types fall back to wav
+		$audioInputFormat = is_string($audioInputMimetype)
+			? (self::SUPPORTED_INPUT_AUDIO_FORMATS[$audioInputMimetype] ?? 'wav')
+			: 'wav';
+		$b64Audio = base64_encode($audioInputContent);
 		$extraParams = [
 			'modalities' => ['text', 'audio'],
@@ -221,7 +237,7 @@ private function oneStep(
 		$systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.';
 		$completion = $this->openAiAPIService->createChatCompletion(
 			$userId, $llmModel, null, $systemPrompt, $history, 1, 1000,
-			$extraParams, null, null, $b64Audio,
+			$extraParams, null, null, $b64Audio, $audioInputFormat,
 		);
 		$message = array_pop($completion['audio_messages']);
 		// TODO find a way to force the model to answer with audio when there is only text in the history