From 8aff1612481649bc60157098088cf23a3ab4a871 Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien-nc@posteo.net>
Date: Thu, 4 Dec 2025 15:12:55 +0100
Subject: [PATCH] feat(audio-chat): detect audio input format when sending it
 as a message to the chat completion endpoint, support wav and mp3

Signed-off-by: Julien Veyssier <julien-nc@posteo.net>
---
 lib/Service/OpenAiAPIService.php                |  4 +++-
 lib/TaskProcessing/AudioToAudioChatProvider.php | 13 ++++++++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/lib/Service/OpenAiAPIService.php b/lib/Service/OpenAiAPIService.php
index b8e79328..a7652aec 100644
--- a/lib/Service/OpenAiAPIService.php
+++ b/lib/Service/OpenAiAPIService.php
@@ -490,6 +490,7 @@ public function createCompletion(
 	 * @param string|null $toolMessage JSON string with role, content, tool_call_id
 	 * @param array|null $tools
 	 * @param string|null $userAudioPromptBase64
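+	 * @param string|null $userAudioPromptFormat Format of $userAudioPromptBase64, sent as input_audio.format (e.g. 'wav' or 'mp3'); null falls back to 'wav'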
 	 * @return array{messages: array<string>, tool_calls: array<string>, audio_messages: list<array<string, mixed>>}
 	 * @throws Exception
 	 */
@@ -505,6 +506,7 @@ public function createChatCompletion(
 		?string $toolMessage = null,
 		?array $tools = null,
 		?string $userAudioPromptBase64 = null,
+		?string $userAudioPromptFormat = null,
 	): array {
 		if ($this->isQuotaExceeded($userId, Application::QUOTA_TYPE_TEXT)) {
 			throw new Exception($this->l10n->t('Text generation quota exceeded'), Http::STATUS_TOO_MANY_REQUESTS);
@@ -560,7 +562,7 @@ public function createChatCompletion(
 						'type' => 'input_audio',
 						'input_audio' => [
 							'data' => $userAudioPromptBase64,
-							'format' => 'wav',
+							'format' => $userAudioPromptFormat ?? 'wav',
 						],
 					],
 				],
diff --git a/lib/TaskProcessing/AudioToAudioChatProvider.php b/lib/TaskProcessing/AudioToAudioChatProvider.php
index 4cbe3ac3..9e0502c0 100644
--- a/lib/TaskProcessing/AudioToAudioChatProvider.php
+++ b/lib/TaskProcessing/AudioToAudioChatProvider.php
@@ -25,6 +25,15 @@
 
 class AudioToAudioChatProvider implements ISynchronousProvider {
 
+	// Maps input MIME types to the input_audio format names sent to the API; OpenAI supports wav and mp3
+	// https://platform.openai.com/docs/api-reference/chat/create#chat-create-messages
+	private const SUPPORTED_INPUT_AUDIO_FORMATS = [
+		'audio/mp3' => 'mp3',
+		'audio/mpeg' => 'mp3',
+		'audio/wav' => 'wav',
+		'audio/x-wav' => 'wav',
+	];
+
 	public function __construct(
 		private OpenAiAPIService $openAiAPIService,
 		private IL10N $l,
@@ -213,6 +222,8 @@ private function oneStep(
 		string $sttModel, string $llmModel, string $ttsModel, float $speed, string $serviceName,
 	): array {
 		$result = [];
+		$audioInputMimetype = mime_content_type($inputFile->fopen('rb'));
+		$audioInputFormat = self::SUPPORTED_INPUT_AUDIO_FORMATS[$audioInputMimetype] ?? 'wav';
 		$b64Audio = base64_encode($inputFile->getContent());
 		$extraParams = [
 			'modalities' => ['text', 'audio'],
@@ -221,7 +232,7 @@ private function oneStep(
 		$systemPrompt .= ' Producing text responses will break the user interface. Important: You have multimodal voice capability, and you use voice exclusively to respond.';
 		$completion = $this->openAiAPIService->createChatCompletion(
 			$userId, $llmModel, null, $systemPrompt, $history, 1, 1000,
-			$extraParams, null, null, $b64Audio,
+			$extraParams, null, null, $b64Audio, $audioInputFormat
 		);
 		$message = array_pop($completion['audio_messages']);
 		// TODO find a way to force the model to answer with audio when there is only text in the history