From 7e6d9bce6467f6f27d694ca20821d083c6b1e49c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 09:38:51 +0000
Subject: [PATCH 1/8] Initial plan
From 9607ee21db1191bcd52aa1a40c66ec31f3a470fd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 09:42:53 +0000
Subject: [PATCH 2/8] Initial analysis: Add OpenAI speech API endpoint linked
to Gemini TTS
Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
package-lock.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/package-lock.json b/package-lock.json
index 5545433..161931c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8,7 +8,7 @@
"@whatwg-node/server": "0.9"
},
"devDependencies": {
- "nodemon": "^3.1.7"
+ "nodemon": "3"
}
},
"node_modules/@kamilkisiela/fast-url-parser": {
From cd058944544b8d13de9db2e02042f6caf8faa2b4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 09:51:22 +0000
Subject: [PATCH 3/8] Add OpenAI speech API endpoint with Gemini TTS
integration
Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
README.md | 17 +++++
src/worker.mjs | 164 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 181 insertions(+)
diff --git a/README.md b/README.md
index 8ceec43..608b876 100644
--- a/README.md
+++ b/README.md
@@ -194,3 +194,20 @@ For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-ap
- [x] `embeddings`
- [x] `dimensions`
- [x] `models`
+- [x] `audio/speech` (Text-to-Speech)
+
+
+ - [x] `model`
+ - `tts-1` => `gemini-2.5-flash-preview-tts`
+ - `tts-1-hd` => `gemini-2.5-pro-preview-tts`
+ - Can also specify Gemini model names directly
+ - [x] `input` (required)
+ - [x] `voice` (required)
+ - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
+ - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
+ - [x] `response_format`
+ - Supported: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm`
+ - Default: `mp3`
+ - [ ] `speed` (not yet implemented)
+
+
diff --git a/src/worker.mjs b/src/worker.mjs
index 000dab0..8b4b85d 100644
--- a/src/worker.mjs
+++ b/src/worker.mjs
@@ -31,6 +31,10 @@ export default {
assert(request.method === "GET");
return handleModels(apiKey)
.catch(errHandler);
+ case pathname.endsWith("/audio/speech"):
+ assert(request.method === "POST");
+ return handleSpeech(await request.json(), apiKey)
+ .catch(errHandler);
default:
throw new HttpError("404 Not Found", 404);
}
@@ -142,6 +146,166 @@ async function handleEmbeddings (req, apiKey) {
return new Response(body, fixCors(response));
}
+const DEFAULT_SPEECH_MODEL = "gemini-2.5-flash-preview-tts";
+// Map OpenAI voices to Gemini TTS voices (Gemini offers more than are used here).
+// Lookups are keyed by the untrusted request `voice`, so the table has a null
+// prototype: keys like "constructor" or "__proto__" miss instead of inheriting.
+const VOICE_MAP = Object.freeze({ __proto__: null,
+  "alloy": "Puck", // Neutral, balanced
+  "echo": "Charon", // Male voice
+  "fable": "Kore", // Expressive
+  "onyx": "Fenrir", // Deep, authoritative
+  "nova": "Aoede", // Warm, friendly
+  "shimmer": "Aoede", // Similar to nova
+});
+/**
+ * Serve an OpenAI-style `audio/speech` request through Gemini `generateContent`.
+ * @param {object} req - Parsed JSON body (`model`, `input`, `voice`, `response_format`).
+ * @param {string} apiKey - Forwarded to `makeHeaders` for the upstream call.
+ * @returns {Promise<Response>} Audio bytes, or Gemini's error body if the call fails.
+ * @throws {HttpError} 400 for invalid request fields; 500 if Gemini returns no audio.
+ */
+async function handleSpeech (req, apiKey) {
+  // OpenAI aliases map to Gemini TTS models; a missing or unrecognised model
+  // (and "tts-1" itself) resolves to DEFAULT_SPEECH_MODEL.
+  let model = DEFAULT_SPEECH_MODEL;
+  if (typeof req.model === "string") {
+    if (req.model.startsWith("models/")) {
+      model = req.model.substring(7); // drop the "models/" prefix
+    } else if (req.model.startsWith("gemini-")) {
+      model = req.model;
+    } else if (req.model === "tts-1-hd") {
+      model = "gemini-2.5-pro-preview-tts";
+    }
+  }
+
+  if (!req.input) {
+    throw new HttpError("input is required", 400);
+  }
+  if (!req.voice) {
+    throw new HttpError("voice is required", 400);
+  }
+
+  // req.voice is untrusted: a key such as "constructor" would resolve to an
+  // inherited, non-string member of VOICE_MAP, so only string hits are used.
+  const mappedVoice = VOICE_MAP[req.voice];
+  const geminiVoice = typeof mappedVoice === "string" ? mappedVoice : "Puck";
+
+  // Build Gemini request: audio-only output spoken with the mapped voice
+  const geminiRequest = {
+    contents: [{ parts: [{ text: req.input }] }],
+    generationConfig: {
+      responseModalities: ["AUDIO"],
+      speechConfig: {
+        voiceConfig: {
+          prebuiltVoiceConfig: { voiceName: geminiVoice },
+        },
+      },
+    },
+  };
+
+  // Call Gemini API
+  const url = `${BASE_URL}/${API_VERSION}/models/${model}:generateContent`;
+  const response = await fetch(url, {
+    method: "POST",
+    headers: makeHeaders(apiKey, { "Content-Type": "application/json" }),
+    body: JSON.stringify(geminiRequest),
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text();
+    console.error("Gemini API error:", errorText);
+    return new Response(errorText, fixCors(response));
+  }
+
+  const geminiResponse = JSON.parse(await response.text());
+
+  // Use the first part that carries inline audio instead of assuming parts[0]
+  const parts = geminiResponse.candidates?.[0]?.content?.parts;
+  const audioPart = Array.isArray(parts)
+    ? parts.find((part) => part?.inlineData?.data)
+    : undefined;
+  if (!audioPart) {
+    throw new HttpError("No audio data in response", 500);
+  }
+
+  const audioData = audioPart.inlineData;
+  const audioBytes = Buffer.from(audioData.data, "base64");
+
+  // Convert response format if needed
+  const responseFormat = req.response_format || "mp3";
+  let outputBuffer = audioBytes;
+  let mimeType = "audio/mpeg";
+
+  // Gemini returns PCM by default, we need to convert to the requested format
+  // For now, we'll handle basic formats
+  switch (responseFormat) {
+    case "mp3":
+      mimeType = "audio/mpeg";
+      // TODO: Convert PCM to MP3 if needed
+      // For now, return as-is since Gemini may support different output formats
+      break;
+    case "opus":
+      mimeType = "audio/opus";
+      break;
+    case "aac":
+      mimeType = "audio/aac";
+      break;
+    case "flac":
+      mimeType = "audio/flac";
+      break;
+    case "wav":
+      mimeType = "audio/wav";
+      // Convert PCM to WAV format
+      outputBuffer = convertPCMToWAV(audioBytes);
+      break;
+    case "pcm":
+      mimeType = "audio/pcm";
+      break;
+    default:
+      mimeType = "audio/mpeg";
+  }
+
+  return new Response(outputBuffer, {
+    headers: {
+      "Content-Type": mimeType,
+      "Access-Control-Allow-Origin": "*",
+    }
+  });
+}
+
+/**
+ * Prefix raw PCM bytes with a 44-byte RIFF/WAVE header; samples are copied as-is.
+ * @param {Uint8Array} pcmData - Headerless PCM bytes (a Buffer also works).
+ * @param {number} [sampleRate=24000] - Samples per second written to the header.
+ * @param {number} [numChannels=1] - Channel count written to the header.
+ * @param {number} [bitsPerSample=16] - Bits per sample written to the header.
+ * @returns {Buffer} A new buffer holding the header followed by the PCM bytes.
+ */
+function convertPCMToWAV(pcmData, sampleRate = 24000, numChannels = 1, bitsPerSample = 16) {
+  const blockAlign = numChannels * bitsPerSample / 8; // bytes per sample frame
+  const dataSize = pcmData.length;
+  const buffer = Buffer.alloc(44 + dataSize);
+  // RIFF header (size field counts everything after itself)
+  buffer.write("RIFF", 0);
+  buffer.writeUInt32LE(36 + dataSize, 4);
+  buffer.write("WAVE", 8);
+  // fmt chunk
+  buffer.write("fmt ", 12);
+  buffer.writeUInt32LE(16, 16); // fmt chunk size
+  buffer.writeUInt16LE(1, 20); // PCM format
+  buffer.writeUInt16LE(numChannels, 22);
+  buffer.writeUInt32LE(sampleRate, 24);
+  buffer.writeUInt32LE(sampleRate * blockAlign, 28); // byte rate
+  buffer.writeUInt16LE(blockAlign, 32); // block align
+  buffer.writeUInt16LE(bitsPerSample, 34);
+  // data chunk
+  buffer.write("data", 36);
+  buffer.writeUInt32LE(dataSize, 40);
+  buffer.set(pcmData, 44); // TypedArray set: accepts Buffer or plain Uint8Array
+  return buffer;
+}
+
const DEFAULT_MODEL = "gemini-flash-latest";
async function handleCompletions (req, apiKey) {
let model;
From 05d73669a600892fe094951c190069d20ef03dae Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 09:52:52 +0000
Subject: [PATCH 4/8] Add TTS usage documentation and examples to README
Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
README.md | 33 +++++++++++++++++++++++++++++++++
1 file changed, 33 insertions(+)
diff --git a/README.md b/README.md
index 608b876..02d249e 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,39 @@ The most notable of these is [`thinking_config`](https://ai.google.dev/gemini-ap
For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-api/docs/openai#extra-body).
+
+## Text-to-Speech (TTS)
+
+The `/v1/audio/speech` endpoint provides OpenAI-compatible text-to-speech functionality powered by Gemini's TTS models.
+
+### Example Usage
+
+```bash
+curl https://your-endpoint.com/v1/audio/speech \
+ -H "Authorization: Bearer YOUR_GEMINI_API_KEY" \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "tts-1",
+ "input": "Hello! This is a test of the text-to-speech API.",
+ "voice": "alloy",
+ "response_format": "mp3"
+ }' \
+ --output speech.mp3
+```
+
+### Model Mapping
+- `tts-1` → `gemini-2.5-flash-preview-tts` (faster, optimized for speed)
+- `tts-1-hd` → `gemini-2.5-pro-preview-tts` (higher quality)
+
+### Voice Mapping
+OpenAI voices are mapped to Gemini TTS voices:
+- `alloy` → Puck (neutral, balanced)
+- `echo` → Charon (male voice)
+- `fable` → Kore (expressive)
+- `onyx` → Fenrir (deep, authoritative)
+- `nova` → Aoede (warm, friendly)
+- `shimmer` → Aoede (similar to nova)
+
---
## Supported API endpoints and applicable parameters
From f2d39aa653d8001cf97db36b6c292d98c8eed364 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:48:40 +0000
Subject: [PATCH 5/8] Initial plan
From 7515a7860be0c3e37fd7add858d4ff65cb745d00 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:51:54 +0000
Subject: [PATCH 6/8] Fix audio format conversion - limit to WAV and PCM only
Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
README.md | 9 +++++----
src/worker.mjs | 38 +++++++++++++++++++-------------------
2 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/README.md b/README.md
index 02d249e..cb6cfea 100644
--- a/README.md
+++ b/README.md
@@ -161,9 +161,9 @@ curl https://your-endpoint.com/v1/audio/speech \
"model": "tts-1",
"input": "Hello! This is a test of the text-to-speech API.",
"voice": "alloy",
- "response_format": "mp3"
+ "response_format": "wav"
}' \
- --output speech.mp3
+ --output speech.wav
```
### Model Mapping
@@ -239,8 +239,9 @@ OpenAI voices are mapped to Gemini TTS voices:
- Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
- Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
- [x] `response_format`
- - Supported: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm`
- - Default: `mp3`
+ - Supported: `wav`, `pcm`
+ - Default: `wav`
+ - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg.
- [ ] `speed` (not yet implemented)
diff --git a/src/worker.mjs b/src/worker.mjs
index 8b4b85d..6e50841 100644
--- a/src/worker.mjs
+++ b/src/worker.mjs
@@ -233,27 +233,13 @@ async function handleSpeech (req, apiKey) {
const audioBytes = Buffer.from(audioData.data, "base64");
// Convert response format if needed
- const responseFormat = req.response_format || "mp3";
+ // Gemini returns raw PCM audio data (24 kHz, 16-bit, mono)
+ // Only WAV and PCM formats are currently supported
+ const responseFormat = req.response_format || "wav";
let outputBuffer = audioBytes;
- let mimeType = "audio/mpeg";
+ let mimeType;
- // Gemini returns PCM by default, we need to convert to the requested format
- // For now, we'll handle basic formats
switch (responseFormat) {
- case "mp3":
- mimeType = "audio/mpeg";
- // TODO: Convert PCM to MP3 if needed
- // For now, return as-is since Gemini may support different output formats
- break;
- case "opus":
- mimeType = "audio/opus";
- break;
- case "aac":
- mimeType = "audio/aac";
- break;
- case "flac":
- mimeType = "audio/flac";
- break;
case "wav":
mimeType = "audio/wav";
// Convert PCM to WAV format
@@ -261,9 +247,23 @@ async function handleSpeech (req, apiKey) {
break;
case "pcm":
mimeType = "audio/pcm";
+ // Return raw PCM data
break;
+ case "mp3":
+ case "opus":
+ case "aac":
+ case "flac":
+ throw new HttpError(
+ `Format "${responseFormat}" is not supported. Gemini returns raw PCM audio. ` +
+ `Only "wav" and "pcm" formats are supported. ` +
+ `For other formats, use external conversion tools like ffmpeg.`,
+ 400
+ );
default:
- mimeType = "audio/mpeg";
+ throw new HttpError(
+ `Unknown response_format: "${responseFormat}". Supported formats: "wav", "pcm"`,
+ 400
+ );
}
return new Response(outputBuffer, {
From 2786f6d0bceca4a43976dae27599e008a4195d47 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 09:07:39 +0000
Subject: [PATCH 7/8] Initial plan
From 2aa8d0728dce8ff5faca89f6d459fc33daa89176 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 09:09:48 +0000
Subject: [PATCH 8/8] Fix markdown list indentation violations in README.md
Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
README.md | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/README.md b/README.md
index cb6cfea..c8abe22 100644
--- a/README.md
+++ b/README.md
@@ -231,17 +231,17 @@ OpenAI voices are mapped to Gemini TTS voices:
- [x] `model`
- - `tts-1` => `gemini-2.5-flash-preview-tts`
- - `tts-1-hd` => `gemini-2.5-pro-preview-tts`
- - Can also specify Gemini model names directly
+ - `tts-1` => `gemini-2.5-flash-preview-tts`
+ - `tts-1-hd` => `gemini-2.5-pro-preview-tts`
+ - Can also specify Gemini model names directly (`gemini-*` or `models/...`); any other value falls back to `gemini-2.5-flash-preview-tts`
- [x] `input` (required)
- [x] `voice` (required)
- - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
- - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
+ - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
+ - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede; an unrecognized voice falls back to Puck
- [x] `response_format`
- - Supported: `wav`, `pcm`
- - Default: `wav`
- - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg.
+ - Supported: `wav`, `pcm`
+ - Default: `wav`
+ - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg.
- [ ] `speed` (not yet implemented)