From 7e6d9bce6467f6f27d694ca20821d083c6b1e49c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 12 Nov 2025 09:38:51 +0000 Subject: [PATCH 1/8] Initial plan From 9607ee21db1191bcd52aa1a40c66ec31f3a470fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 12 Nov 2025 09:42:53 +0000 Subject: [PATCH 2/8] Initial analysis: Add OpenAI speech API endpoint linked to Gemini TTS Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com> --- package-lock.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package-lock.json b/package-lock.json index 5545433..161931c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,7 +8,7 @@ "@whatwg-node/server": "0.9" }, "devDependencies": { - "nodemon": "^3.1.7" + "nodemon": "3" } }, "node_modules/@kamilkisiela/fast-url-parser": { From cd058944544b8d13de9db2e02042f6caf8faa2b4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 12 Nov 2025 09:51:22 +0000 Subject: [PATCH 3/8] Add OpenAI speech API endpoint with Gemini TTS integration Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com> --- README.md | 17 +++++ src/worker.mjs | 164 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+) diff --git a/README.md b/README.md index 8ceec43..608b876 100644 --- a/README.md +++ b/README.md @@ -194,3 +194,20 @@ For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-ap - [x] `embeddings` - [x] `dimensions` - [x] `models` +- [x] `audio/speech` (Text-to-Speech) +
+ + - [x] `model` + - `tts-1` => `gemini-2.5-flash-preview-tts` + - `tts-1-hd` => `gemini-2.5-pro-preview-tts` + - Can also specify Gemini model names directly + - [x] `input` (required) + - [x] `voice` (required) + - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` + - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede + - [x] `response_format` + - Supported: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` + - Default: `mp3` + - [ ] `speed` (not yet implemented) + +
diff --git a/src/worker.mjs b/src/worker.mjs index 000dab0..8b4b85d 100644 --- a/src/worker.mjs +++ b/src/worker.mjs @@ -31,6 +31,10 @@ export default { assert(request.method === "GET"); return handleModels(apiKey) .catch(errHandler); + case pathname.endsWith("/audio/speech"): + assert(request.method === "POST"); + return handleSpeech(await request.json(), apiKey) + .catch(errHandler); default: throw new HttpError("404 Not Found", 404); } @@ -142,6 +146,166 @@ async function handleEmbeddings (req, apiKey) { return new Response(body, fixCors(response)); } +const DEFAULT_SPEECH_MODEL = "gemini-2.5-flash-preview-tts"; +// Map OpenAI voices to Gemini TTS voices +// OpenAI: alloy, echo, fable, onyx, nova, shimmer +// Gemini: Puck, Charon, Kore, Fenrir, Aoede, and many more +const VOICE_MAP = { + "alloy": "Puck", // Neutral, balanced + "echo": "Charon", // Male voice + "fable": "Kore", // Expressive + "onyx": "Fenrir", // Deep, authoritative + "nova": "Aoede", // Warm, friendly + "shimmer": "Aoede", // Similar to nova +}; +async function handleSpeech (req, apiKey) { + // Map OpenAI model names to Gemini TTS models + let model; + switch (true) { + case typeof req.model !== "string": + model = DEFAULT_SPEECH_MODEL; + break; + case req.model.startsWith("models/"): + model = req.model.substring(7); + break; + case req.model.startsWith("gemini-"): + model = req.model; + break; + case req.model === "tts-1": + model = DEFAULT_SPEECH_MODEL; + break; + case req.model === "tts-1-hd": + model = "gemini-2.5-pro-preview-tts"; + break; + default: + model = DEFAULT_SPEECH_MODEL; + } + + if (!req.input) { + throw new HttpError("input is required", 400); + } + if (!req.voice) { + throw new HttpError("voice is required", 400); + } + + // Map OpenAI voice to Gemini voice + const geminiVoice = VOICE_MAP[req.voice] || "Puck"; + + // Build Gemini request + const geminiRequest = { + contents: [{ + parts: [{ text: req.input }] + }], + generationConfig: { + responseModalities: ["AUDIO"], + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { + voiceName: geminiVoice + } + } + } + } + }; + + // Call Gemini API + const url = `${BASE_URL}/${API_VERSION}/models/${model}:generateContent`; + const response = await fetch(url, { + method: "POST", + headers: makeHeaders(apiKey, { "Content-Type": "application/json" }), + body: JSON.stringify(geminiRequest), + }); + + if (!response.ok) { + const errorText = await response.text(); + console.error("Gemini API error:", errorText); + return new Response(errorText, fixCors(response)); + } + + const geminiResponse = JSON.parse(await response.text()); + + // Extract audio data from Gemini response + if (!geminiResponse.candidates?.[0]?.content?.parts?.[0]?.inlineData) { + throw new HttpError("No audio data in response", 500); + } + + const audioData = geminiResponse.candidates[0].content.parts[0].inlineData; + const audioBytes = Buffer.from(audioData.data, "base64"); + + // Convert response format if needed + const responseFormat = req.response_format || "mp3"; + let outputBuffer = audioBytes; + let mimeType = "audio/mpeg"; + + // Gemini returns PCM by default, we need to convert to the requested format + // For now, we'll handle basic formats + switch (responseFormat) { + case "mp3": + mimeType = "audio/mpeg"; + // TODO: Convert PCM to MP3 if needed + // For now, return as-is since Gemini may support different output formats + break; + case "opus": + mimeType = "audio/opus"; + break; + case "aac": + mimeType = "audio/aac"; + break; + case "flac": + mimeType = "audio/flac"; + break; + case "wav": + mimeType = "audio/wav"; + // Convert PCM to WAV format + outputBuffer = convertPCMToWAV(audioBytes); + break; + case "pcm": + mimeType = "audio/pcm"; + break; + default: + mimeType = "audio/mpeg"; + } + + return new Response(outputBuffer, { + headers: { + "Content-Type": mimeType, + "Access-Control-Allow-Origin": "*", + } + }); +} + +// Helper function to convert PCM to WAV format +function convertPCMToWAV(pcmData) { + const sampleRate = 24000; + const numChannels = 1; + const bitsPerSample = 16; + + const dataSize = pcmData.length; + const buffer = Buffer.alloc(44 + dataSize); + + // RIFF header + buffer.write("RIFF", 0); + buffer.writeUInt32LE(36 + dataSize, 4); + buffer.write("WAVE", 8); + + // fmt chunk + buffer.write("fmt ", 12); + buffer.writeUInt32LE(16, 16); // fmt chunk size + buffer.writeUInt16LE(1, 20); // PCM format + buffer.writeUInt16LE(numChannels, 22); + buffer.writeUInt32LE(sampleRate, 24); + buffer.writeUInt32LE(sampleRate * numChannels * bitsPerSample / 8, 28); // byte rate + buffer.writeUInt16LE(numChannels * bitsPerSample / 8, 32); // block align + buffer.writeUInt16LE(bitsPerSample, 34); + + // data chunk + buffer.write("data", 36); + buffer.writeUInt32LE(dataSize, 40); + pcmData.copy(buffer, 44); + + return buffer; +} + const DEFAULT_MODEL = "gemini-flash-latest"; async function handleCompletions (req, apiKey) { let model; From 05d73669a600892fe094951c190069d20ef03dae Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 12 Nov 2025 09:52:52 +0000 Subject: [PATCH 4/8] Add TTS usage documentation and examples to README Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com> --- README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/README.md b/README.md index 608b876..02d249e 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,39 @@ The most notable of these is [`thinking_config`](https://ai.google.dev/gemini-ap For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-api/docs/openai#extra-body). + +## Text-to-Speech (TTS) + +The `/v1/audio/speech` endpoint provides OpenAI-compatible text-to-speech functionality powered by Gemini's TTS models. + +### Example Usage + +```bash +curl https://your-endpoint.com/v1/audio/speech \ + -H "Authorization: Bearer YOUR_GEMINI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "tts-1", + "input": "Hello! This is a test of the text-to-speech API.", + "voice": "alloy", + "response_format": "mp3" + }' \ + --output speech.mp3 +``` + +### Model Mapping +- `tts-1` → `gemini-2.5-flash-preview-tts` (faster, optimized for speed) +- `tts-1-hd` → `gemini-2.5-pro-preview-tts` (higher quality) + +### Voice Mapping +OpenAI voices are mapped to Gemini TTS voices: +- `alloy` → Puck (neutral, balanced) +- `echo` → Charon (male voice) +- `fable` → Kore (expressive) +- `onyx` → Fenrir (deep, authoritative) +- `nova` → Aoede (warm, friendly) +- `shimmer` → Aoede (similar to nova) + --- ## Supported API endpoints and applicable parameters From f2d39aa653d8001cf97db36b6c292d98c8eed364 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 13 Nov 2025 01:48:40 +0000 Subject: [PATCH 5/8] Initial plan From 7515a7860be0c3e37fd7add858d4ff65cb745d00 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 13 Nov 2025 01:51:54 +0000 Subject: [PATCH 6/8] Fix audio format conversion - limit to WAV and PCM only Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com> --- README.md | 9 +++++---- src/worker.mjs | 38 +++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 02d249e..cb6cfea 100644 --- a/README.md +++ b/README.md @@ -161,9 +161,9 @@ curl https://your-endpoint.com/v1/audio/speech \ "model": "tts-1", "input": "Hello! This is a test of the text-to-speech API.", "voice": "alloy", - "response_format": "mp3" + "response_format": "wav" }' \ - --output speech.mp3 + --output speech.wav ``` ### Model Mapping @@ -239,8 +239,9 @@ OpenAI voices are mapped to Gemini TTS voices: - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede - [x] `response_format` - - Supported: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` - - Default: `mp3` + - Supported: `wav`, `pcm` + - Default: `wav` + - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg. - [ ] `speed` (not yet implemented) diff --git a/src/worker.mjs b/src/worker.mjs index 8b4b85d..6e50841 100644 --- a/src/worker.mjs +++ b/src/worker.mjs @@ -233,27 +233,13 @@ async function handleSpeech (req, apiKey) { const audioBytes = Buffer.from(audioData.data, "base64"); // Convert response format if needed - const responseFormat = req.response_format || "mp3"; + // Gemini returns raw PCM audio data (24 kHz, 16-bit, mono) + // Only WAV and PCM formats are currently supported + const responseFormat = req.response_format || "wav"; let outputBuffer = audioBytes; - let mimeType = "audio/mpeg"; + let mimeType; - // Gemini returns PCM by default, we need to convert to the requested format - // For now, we'll handle basic formats switch (responseFormat) { - case "mp3": - mimeType = "audio/mpeg"; - // TODO: Convert PCM to MP3 if needed - // For now, return as-is since Gemini may support different output formats - break; - case "opus": - mimeType = "audio/opus"; - break; - case "aac": - mimeType = "audio/aac"; - break; - case "flac": - mimeType = "audio/flac"; - break; case "wav": mimeType = "audio/wav"; // Convert PCM to WAV format @@ -261,9 +247,23 @@ async function handleSpeech (req, apiKey) { break; case "pcm": mimeType = "audio/pcm"; + // Return raw PCM data break; + case "mp3": + case "opus": + case "aac": + case "flac": + throw new HttpError( + `Format "${responseFormat}" is not supported. Gemini returns raw PCM audio. ` + + `Only "wav" and "pcm" formats are supported. ` + + `For other formats, use external conversion tools like ffmpeg.`, + 400 + ); default: - mimeType = "audio/mpeg"; + throw new HttpError( + `Unknown response_format: "${responseFormat}". Supported formats: "wav", "pcm"`, + 400 + ); } return new Response(outputBuffer, { From 2786f6d0bceca4a43976dae27599e008a4195d47 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 13 Nov 2025 09:07:39 +0000 Subject: [PATCH 7/8] Initial plan From 2aa8d0728dce8ff5faca89f6d459fc33daa89176 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 13 Nov 2025 09:09:48 +0000 Subject: [PATCH 8/8] Fix markdown list indentation violations in README.md Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com> --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index cb6cfea..c8abe22 100644 --- a/README.md +++ b/README.md @@ -231,17 +231,17 @@ OpenAI voices are mapped to Gemini TTS voices:
- [x] `model` - - `tts-1` => `gemini-2.5-flash-preview-tts` - - `tts-1-hd` => `gemini-2.5-pro-preview-tts` - - Can also specify Gemini model names directly + - `tts-1` => `gemini-2.5-flash-preview-tts` + - `tts-1-hd` => `gemini-2.5-pro-preview-tts` + - Can also specify Gemini model names directly - [x] `input` (required) - [x] `voice` (required) - - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` - - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede + - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer` + - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede - [x] `response_format` - - Supported: `wav`, `pcm` - - Default: `wav` - - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg. + - Supported: `wav`, `pcm` + - Default: `wav` + - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg. - [ ] `speed` (not yet implemented)