PublicAffairs · groxaxo · Nov 12, 2025 · Nov 12, 2025 · Nov 12, 2025 · Nov 12, 2025
diff --git a/README.md b/README.md
@@ -146,6 +146,39 @@ The most notable of these is [`thinking_config`](https://ai.google.dev/gemini-ap
 
 For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-api/docs/openai#extra-body).
 
+
+## Text-to-Speech (TTS)
+
+The `/v1/audio/speech` endpoint provides OpenAI-compatible text-to-speech functionality powered by Gemini's TTS models.
+
+### Example Usage
+
+```bash
+curl https://your-endpoint.com/v1/audio/speech \
+  -H "Authorization: Bearer YOUR_GEMINI_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "tts-1",
+    "input": "Hello! This is a test of the text-to-speech API.",
+    "voice": "alloy",
+    "response_format": "mp3"
+  }' \
+  --output speech.mp3
+```
+
+### Model Mapping
+- `tts-1` → `gemini-2.5-flash-preview-tts` (faster, optimized for speed)
+- `tts-1-hd` → `gemini-2.5-pro-preview-tts` (higher quality)
+
+### Voice Mapping
+OpenAI voices are mapped to Gemini TTS voices:
+- `alloy` → Puck (neutral, balanced)
+- `echo` → Charon (male voice)
+- `fable` → Kore (expressive)
+- `onyx` → Fenrir (deep, authoritative)
+- `nova` → Aoede (warm, friendly)
+- `shimmer` → Aoede (similar to nova)
+
 ---
 
 ## Supported API endpoints and applicable parameters
@@ -194,3 +227,20 @@ For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-ap
 - [x] `embeddings`
   - [x] `dimensions`
 - [x] `models`
+- [x] `audio/speech` (Text-to-Speech)
+  <details>
+
+  - [x] `model`
+      - `tts-1` => `gemini-2.5-flash-preview-tts`
+      - `tts-1-hd` => `gemini-2.5-pro-preview-tts`
+      - Can also specify Gemini model names directly
+  - [x] `input` (required)
+  - [x] `voice` (required)
+      - Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
+      - Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
+  - [x] `response_format`
+      - Supported: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm`
+      - Default: `mp3`
+  - [ ] `speed` (not yet implemented)
+
+  </details>
diff --git a/package-lock.json b/package-lock.json
diff --git a/src/worker.mjs b/src/worker.mjs
@@ -31,6 +31,10 @@ export default {
           assert(request.method === "GET");
           return handleModels(apiKey)
             .catch(errHandler);
+        case pathname.endsWith("/audio/speech"):
+          assert(request.method === "POST");
+          return handleSpeech(await request.json(), apiKey)
+            .catch(errHandler);
         default:
           throw new HttpError("404 Not Found", 404);
       }
@@ -142,6 +146,166 @@ async function handleEmbeddings (req, apiKey) {
   return new Response(body, fixCors(response));
 }
 
+const DEFAULT_SPEECH_MODEL = "gemini-2.5-flash-preview-tts";
+// Map OpenAI voices to Gemini TTS voices
+// OpenAI: alloy, echo, fable, onyx, nova, shimmer
+// Gemini: Puck, Charon, Kore, Fenrir, Aoede, and many more
+const VOICE_MAP = {
+  "alloy": "Puck",      // Neutral, balanced
+  "echo": "Charon",     // Male voice
+  "fable": "Kore",      // Expressive
+  "onyx": "Fenrir",     // Deep, authoritative
+  "nova": "Aoede",      // Warm, friendly
+  "shimmer": "Aoede",   // Similar to nova
+};
+async function handleSpeech (req, apiKey) {
+  // Map OpenAI model names to Gemini TTS models
+  let model;
+  switch (true) {
+    case typeof req.model !== "string":
+      model = DEFAULT_SPEECH_MODEL;
+      break;
+    case req.model.startsWith("models/"):
+      model = req.model.substring(7);
+      break;
+    case req.model.startsWith("gemini-"):
+      model = req.model;
+      break;
+    case req.model === "tts-1":
+      model = DEFAULT_SPEECH_MODEL;
+      break;
+    case req.model === "tts-1-hd":
+      model = "gemini-2.5-pro-preview-tts";
+      break;
+    default:
+      model = DEFAULT_SPEECH_MODEL;
+  }
+
+  if (!req.input) {
+    throw new HttpError("input is required", 400);
+  }
+  if (!req.voice) {
+    throw new HttpError("voice is required", 400);
+  }
+
+  // Map OpenAI voice to Gemini voice
+  const geminiVoice = VOICE_MAP[req.voice] || "Puck";
+
+  // Build Gemini request
+  const geminiRequest = {
+    contents: [{
+      parts: [{ text: req.input }]
+    }],
+    generationConfig: {
+      responseModalities: ["AUDIO"],
+      speechConfig: {
+        voiceConfig: {
+          prebuiltVoiceConfig: {
+            voiceName: geminiVoice
+          }
+        }
+      }
+    }
+  };
+
+  // Call Gemini API
+  const url = `${BASE_URL}/${API_VERSION}/models/${model}:generateContent`;
+  const response = await fetch(url, {
+    method: "POST",
+    headers: makeHeaders(apiKey, { "Content-Type": "application/json" }),
+    body: JSON.stringify(geminiRequest),
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text();
+    console.error("Gemini API error:", errorText);
+    return new Response(errorText, fixCors(response));
+  }
+
+  const geminiResponse = JSON.parse(await response.text());
+
+  // Extract audio data from Gemini response
+  if (!geminiResponse.candidates?.[0]?.content?.parts?.[0]?.inlineData) {
+    throw new HttpError("No audio data in response", 500);
+  }
+
+  const audioData = geminiResponse.candidates[0].content.parts[0].inlineData;
+  const audioBytes = Buffer.from(audioData.data, "base64");
+
+  // Convert response format if needed
+  const responseFormat = req.response_format || "mp3";
+  let outputBuffer = audioBytes;
+  let mimeType = "audio/mpeg";
+
+  // Gemini returns PCM by default, we need to convert to the requested format
+  // For now, we'll handle basic formats
+  switch (responseFormat) {
+    case "mp3":
+      mimeType = "audio/mpeg";
+      // TODO: Convert PCM to MP3 if needed
+      // For now, return as-is since Gemini may support different output formats
+      break;
+    case "opus":
+      mimeType = "audio/opus";
+      break;
+    case "aac":
+      mimeType = "audio/aac";
+      break;
+    case "flac":
+      mimeType = "audio/flac";
+      break;
+    case "wav":
+      mimeType = "audio/wav";
+      // Convert PCM to WAV format
+      outputBuffer = convertPCMToWAV(audioBytes);
+      break;
+    case "pcm":
+      mimeType = "audio/pcm";
+      break;
+    default:
+      mimeType = "audio/mpeg";
+  }
+
+  return new Response(outputBuffer, {
+    headers: {
+      "Content-Type": mimeType,
+      "Access-Control-Allow-Origin": "*",
+    }
+  });
+}
+
+// Helper function to convert PCM to WAV format
+function convertPCMToWAV(pcmData) {
+  const sampleRate = 24000;
+  const numChannels = 1;
+  const bitsPerSample = 16;
+
+  const dataSize = pcmData.length;
+  const buffer = Buffer.alloc(44 + dataSize);
+
+  // RIFF header
+  buffer.write("RIFF", 0);
+  buffer.writeUInt32LE(36 + dataSize, 4);
+  buffer.write("WAVE", 8);
+
+  // fmt chunk
+  buffer.write("fmt ", 12);
+  buffer.writeUInt32LE(16, 16); // fmt chunk size
+  buffer.writeUInt16LE(1, 20); // PCM format
+  buffer.writeUInt16LE(numChannels, 22);
+  buffer.writeUInt32LE(sampleRate, 24);
+  buffer.writeUInt32LE(sampleRate * numChannels * bitsPerSample / 8, 28); // byte rate
+  buffer.writeUInt16LE(numChannels * bitsPerSample / 8, 32); // block align
+  buffer.writeUInt16LE(bitsPerSample, 34);
+
+  // data chunk
+  buffer.write("data", 36);
+  buffer.writeUInt32LE(dataSize, 40);
+  pcmData.copy(buffer, 44);
+
+  return buffer;
+}
+
 const DEFAULT_MODEL = "gemini-flash-latest";
 async function handleCompletions (req, apiKey) {
   let model;