From 7e6d9bce6467f6f27d694ca20821d083c6b1e49c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 09:38:51 +0000
Subject: [PATCH 1/8] Initial plan


From 9607ee21db1191bcd52aa1a40c66ec31f3a470fd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 09:42:53 +0000
Subject: [PATCH 2/8] Initial analysis: Add OpenAI speech API endpoint linked
 to Gemini TTS

Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
 package-lock.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/package-lock.json b/package-lock.json
index 5545433..161931c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8,7 +8,7 @@
         "@whatwg-node/server": "0.9"
       },
       "devDependencies": {
-        "nodemon": "^3.1.7"
+        "nodemon": "3"
       }
     },
     "node_modules/@kamilkisiela/fast-url-parser": {

From cd058944544b8d13de9db2e02042f6caf8faa2b4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 09:51:22 +0000
Subject: [PATCH 3/8] Add OpenAI speech API endpoint with Gemini TTS
 integration

Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
 README.md      |  17 +++++
 src/worker.mjs | 164 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 181 insertions(+)

diff --git a/README.md b/README.md
index 8ceec43..608b876 100644
--- a/README.md
+++ b/README.md
@@ -194,3 +194,20 @@ For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-ap
 - [x] `embeddings`
   - [x] `dimensions`
 - [x] `models`
+- [x] `audio/speech` (Text-to-Speech)
+  <details>
+
+  - [x] `model`
+      - `tts-1` => `gemini-2.5-flash-preview-tts`
+      - `tts-1-hd` => `gemini-2.5-pro-preview-tts`
+      - Can also specify Gemini model names directly
+  - [x] `input` (required)
+  - [x] `voice` (required)
+      - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
+      - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
+  - [x] `response_format`
+      - Supported: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm`
+      - Default: `mp3`
+  - [ ] `speed` (not yet implemented)
+
+  </details>
diff --git a/src/worker.mjs b/src/worker.mjs
index 000dab0..8b4b85d 100644
--- a/src/worker.mjs
+++ b/src/worker.mjs
@@ -31,6 +31,10 @@ export default {
           assert(request.method === "GET");
           return handleModels(apiKey)
             .catch(errHandler);
+        case pathname.endsWith("/audio/speech"):
+          assert(request.method === "POST");
+          return handleSpeech(await request.json(), apiKey)
+            .catch(errHandler);
         default:
           throw new HttpError("404 Not Found", 404);
       }
@@ -142,6 +146,166 @@ async function handleEmbeddings (req, apiKey) {
   return new Response(body, fixCors(response));
 }
 
+const DEFAULT_SPEECH_MODEL = "gemini-2.5-flash-preview-tts"; // used for "tts-1" and for missing or unrecognized model names
+// Map OpenAI voices to Gemini TTS voices (handleSpeech falls back to "Puck" for names not listed here)
+// OpenAI: alloy, echo, fable, onyx, nova, shimmer
+// Gemini: Puck, Charon, Kore, Fenrir, Aoede, and many more
+const VOICE_MAP = {
+  "alloy": "Puck",      // Neutral, balanced
+  "echo": "Charon",     // Male voice
+  "fable": "Kore",      // Expressive
+  "onyx": "Fenrir",     // Deep, authoritative
+  "nova": "Aoede",      // Warm, friendly
+  "shimmer": "Aoede",   // Similar to nova (both map to the same Gemini voice)
+};
+/**
+ * Handles an OpenAI-style `audio/speech` request by calling Gemini `generateContent`.
+ * @param {object} req - Parsed request body; reads `model`, `input`, `voice`, `response_format`.
+ * @param apiKey - Forwarded to `makeHeaders` to build the upstream request headers.
+ * @returns {Promise<Response>} Audio bytes with a Content-Type derived from `response_format`, or the upstream error body when Gemini responds with a non-OK status.
+ * @throws {HttpError} 400 when a required field (`input`, `voice`) is missing; 500 when the upstream reply carries no audio.
+ */
+async function handleSpeech (req, apiKey) {
+  // Map OpenAI model names to Gemini TTS models
+  let model;
+  switch (true) {
+    case typeof req.model !== "string":
+      model = DEFAULT_SPEECH_MODEL;
+      break;
+    case req.model.startsWith("models/"):
+      model = req.model.substring(7);
+      break;
+    case req.model.startsWith("gemini-"):
+      model = req.model;
+      break;
+    case req.model === "tts-1":
+      model = DEFAULT_SPEECH_MODEL;
+      break;
+    case req.model === "tts-1-hd":
+      model = "gemini-2.5-pro-preview-tts";
+      break;
+    default:
+      model = DEFAULT_SPEECH_MODEL;
+  }
+  
+  if (!req.input) {
+    throw new HttpError("input is required", 400);
+  }
+  if (!req.voice) {
+    throw new HttpError("voice is required", 400);
+  }
+
+  // Map OpenAI voice to Gemini voice; the own-key check keeps inherited names like "constructor" on the fallback
+  const isKnownVoice = Object.prototype.hasOwnProperty.call(VOICE_MAP, req.voice);
+  const geminiVoice = isKnownVoice ? VOICE_MAP[req.voice] : "Puck";
+
+  // Build Gemini request
+  const geminiRequest = {
+    contents: [{ parts: [{ text: req.input }] }],
+    generationConfig: {
+      responseModalities: ["AUDIO"],
+      speechConfig: {
+        voiceConfig: { prebuiltVoiceConfig: { voiceName: geminiVoice } },
+      },
+    },
+  };
+
+  // Call Gemini API
+  const url = `${BASE_URL}/${API_VERSION}/models/${model}:generateContent`;
+  const response = await fetch(url, {
+    method: "POST",
+    headers: makeHeaders(apiKey, { "Content-Type": "application/json" }),
+    body: JSON.stringify(geminiRequest),
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text();
+    console.error("Gemini API error:", errorText);
+    return new Response(errorText, fixCors(response));
+  }
+
+  // Extract audio data from Gemini response (first part of the first candidate)
+  const geminiResponse = JSON.parse(await response.text());
+  const audioData = geminiResponse.candidates?.[0]?.content?.parts?.[0]?.inlineData;
+  if (!audioData) {
+    throw new HttpError("No audio data in response", 500);
+  }
+  const audioBytes = Buffer.from(audioData.data, "base64");
+
+  // Convert response format if needed
+  const responseFormat = req.response_format || "mp3";
+  let outputBuffer = audioBytes;
+  let mimeType = "audio/mpeg";
+
+  // Gemini returns PCM by default, we need to convert to the requested format
+  // For now, we'll handle basic formats
+  switch (responseFormat) {
+    case "mp3":
+      mimeType = "audio/mpeg";
+      // TODO: Convert PCM to MP3 if needed
+      // For now, return as-is since Gemini may support different output formats
+      break;
+    case "opus":
+      mimeType = "audio/opus";
+      break;
+    case "aac":
+      mimeType = "audio/aac";
+      break;
+    case "flac":
+      mimeType = "audio/flac";
+      break;
+    case "wav":
+      mimeType = "audio/wav";
+      // Convert PCM to WAV format
+      outputBuffer = convertPCMToWAV(audioBytes);
+      break;
+    case "pcm":
+      mimeType = "audio/pcm";
+      break;
+    default:
+      mimeType = "audio/mpeg";
+  }
+
+  return new Response(outputBuffer, {
+    headers: {
+      "Content-Type": mimeType,
+      "Access-Control-Allow-Origin": "*",
+    }
+  });
+}
+
+// Wrap raw PCM bytes in a 44-byte RIFF/WAVE header; the input is assumed to be 24 kHz, mono, 16-bit
+function convertPCMToWAV(pcmData) {
+  const sampleRate = 24000; // hard-coded; the function receives no format information besides the bytes
+  const numChannels = 1;
+  const bitsPerSample = 16;
+  
+  const dataSize = pcmData.length; // pcmData must expose Buffer's .length and .copy()
+  const buffer = Buffer.alloc(44 + dataSize); // 44 = size of the header written below
+  
+  // RIFF header
+  buffer.write("RIFF", 0);
+  buffer.writeUInt32LE(36 + dataSize, 4); // bytes following this field: 44 + dataSize - 8
+  buffer.write("WAVE", 8);
+  
+  // fmt chunk
+  buffer.write("fmt ", 12);
+  buffer.writeUInt32LE(16, 16); // fmt chunk size
+  buffer.writeUInt16LE(1, 20); // PCM format
+  buffer.writeUInt16LE(numChannels, 22);
+  buffer.writeUInt32LE(sampleRate, 24);
+  buffer.writeUInt32LE(sampleRate * numChannels * bitsPerSample / 8, 28); // byte rate
+  buffer.writeUInt16LE(numChannels * bitsPerSample / 8, 32); // block align
+  buffer.writeUInt16LE(bitsPerSample, 34);
+  
+  // data chunk
+  buffer.write("data", 36);
+  buffer.writeUInt32LE(dataSize, 40);
+  pcmData.copy(buffer, 44); // samples are copied unchanged, directly after the header
+  
+  return buffer;
+}
+
 const DEFAULT_MODEL = "gemini-flash-latest";
 async function handleCompletions (req, apiKey) {
   let model;

From 05d73669a600892fe094951c190069d20ef03dae Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 09:52:52 +0000
Subject: [PATCH 4/8] Add TTS usage documentation and examples to README

Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
 README.md | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/README.md b/README.md
index 608b876..02d249e 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,39 @@ The most notable of these is [`thinking_config`](https://ai.google.dev/gemini-ap
 
 For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-api/docs/openai#extra-body).
 
+
+## Text-to-Speech (TTS)
+
+The `/v1/audio/speech` endpoint provides OpenAI-compatible text-to-speech functionality powered by Gemini's TTS models.
+
+### Example Usage
+
+```bash
+curl https://your-endpoint.com/v1/audio/speech \
+  -H "Authorization: Bearer YOUR_GEMINI_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "tts-1",
+    "input": "Hello! This is a test of the text-to-speech API.",
+    "voice": "alloy",
+    "response_format": "mp3"
+  }' \
+  --output speech.mp3
+```
+
+### Model Mapping
+- `tts-1` → `gemini-2.5-flash-preview-tts` (optimized for speed)
+- `tts-1-hd` → `gemini-2.5-pro-preview-tts` (higher quality)
+
+### Voice Mapping
+OpenAI voices are mapped to Gemini TTS voices:
+- `alloy` → Puck (neutral, balanced)
+- `echo` → Charon (male voice)
+- `fable` → Kore (expressive)
+- `onyx` → Fenrir (deep, authoritative)
+- `nova` → Aoede (warm, friendly)
+- `shimmer` → Aoede (similar to nova)
+
 ---
 
 ## Supported API endpoints and applicable parameters

From f2d39aa653d8001cf97db36b6c292d98c8eed364 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:48:40 +0000
Subject: [PATCH 5/8] Initial plan


From 7515a7860be0c3e37fd7add858d4ff65cb745d00 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 01:51:54 +0000
Subject: [PATCH 6/8] Fix audio format conversion - limit to WAV and PCM only

Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
 README.md      |  9 +++++----
 src/worker.mjs | 38 +++++++++++++++++++-------------------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 02d249e..cb6cfea 100644
--- a/README.md
+++ b/README.md
@@ -161,9 +161,9 @@ curl https://your-endpoint.com/v1/audio/speech \
     "model": "tts-1",
     "input": "Hello! This is a test of the text-to-speech API.",
     "voice": "alloy",
-    "response_format": "mp3"
+    "response_format": "wav"
   }' \
-  --output speech.mp3
+  --output speech.wav
 ```
 
 ### Model Mapping
@@ -239,8 +239,9 @@ OpenAI voices are mapped to Gemini TTS voices:
       - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
       - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
   - [x] `response_format`
-      - Supported: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm`
-      - Default: `mp3`
+      - Supported: `wav`, `pcm`
+      - Default: `wav`
+      - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg.
   - [ ] `speed` (not yet implemented)
 
   </details>
diff --git a/src/worker.mjs b/src/worker.mjs
index 8b4b85d..6e50841 100644
--- a/src/worker.mjs
+++ b/src/worker.mjs
@@ -233,27 +233,13 @@ async function handleSpeech (req, apiKey) {
   const audioBytes = Buffer.from(audioData.data, "base64");
 
   // Convert response format if needed
-  const responseFormat = req.response_format || "mp3";
+  // Gemini returns raw PCM audio data (24 kHz, 16-bit, mono)
+  // Only WAV and PCM formats are currently supported
+  const responseFormat = req.response_format || "wav";
   let outputBuffer = audioBytes;
-  let mimeType = "audio/mpeg";
+  let mimeType;
 
-  // Gemini returns PCM by default, we need to convert to the requested format
-  // For now, we'll handle basic formats
   switch (responseFormat) {
-    case "mp3":
-      mimeType = "audio/mpeg";
-      // TODO: Convert PCM to MP3 if needed
-      // For now, return as-is since Gemini may support different output formats
-      break;
-    case "opus":
-      mimeType = "audio/opus";
-      break;
-    case "aac":
-      mimeType = "audio/aac";
-      break;
-    case "flac":
-      mimeType = "audio/flac";
-      break;
     case "wav":
       mimeType = "audio/wav";
       // Convert PCM to WAV format
@@ -261,9 +247,23 @@ async function handleSpeech (req, apiKey) {
       break;
     case "pcm":
       mimeType = "audio/pcm";
+      // Return raw PCM data
       break;
+    case "mp3":
+    case "opus":
+    case "aac":
+    case "flac":
+      throw new HttpError(
+        `Format "${responseFormat}" is not supported. Gemini returns raw PCM audio. ` +
+        `Only "wav" and "pcm" formats are supported. ` +
+        `For other formats, use external conversion tools like ffmpeg.`,
+        400
+      );
     default:
-      mimeType = "audio/mpeg";
+      throw new HttpError(
+        `Unknown response_format: "${responseFormat}". Supported formats: "wav", "pcm"`,
+        400
+      );
   }
 
   return new Response(outputBuffer, {

From 2786f6d0bceca4a43976dae27599e008a4195d47 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 09:07:39 +0000
Subject: [PATCH 7/8] Initial plan


From 2aa8d0728dce8ff5faca89f6d459fc33daa89176 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 09:09:48 +0000
Subject: [PATCH 8/8] Fix markdown list indentation violations in README.md

Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index cb6cfea..c8abe22 100644
--- a/README.md
+++ b/README.md
@@ -231,17 +231,17 @@ OpenAI voices are mapped to Gemini TTS voices:
   <details>
 
   - [x] `model`
-      - `tts-1` => `gemini-2.5-flash-preview-tts`
-      - `tts-1-hd` => `gemini-2.5-pro-preview-tts`
-      - Can also specify Gemini model names directly
+    - `tts-1` => `gemini-2.5-flash-preview-tts`
+    - `tts-1-hd` => `gemini-2.5-pro-preview-tts`
+    - Can also specify Gemini model names directly
   - [x] `input` (required)
   - [x] `voice` (required)
-      - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
-      - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
+    - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
+    - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
   - [x] `response_format`
-      - Supported: `wav`, `pcm`
-      - Default: `wav`
-      - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg.
+    - Supported: `wav`, `pcm`
+    - Default: `wav`
+    - Note: Gemini returns raw PCM audio (24 kHz, 16-bit, mono). Only WAV (with proper headers) and raw PCM formats are supported. For mp3, opus, aac, or flac, use external conversion tools like ffmpeg.
   - [ ] `speed` (not yet implemented)
 
   </details>