Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,39 @@ The most notable of these is [`thinking_config`](https://ai.google.dev/gemini-ap

For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-api/docs/openai#extra-body).


## Text-to-Speech (TTS)

The `/v1/audio/speech` endpoint provides OpenAI-compatible text-to-speech functionality powered by Gemini's TTS models.

### Example Usage

```bash
curl https://your-endpoint.com/v1/audio/speech \
-H "Authorization: Bearer YOUR_GEMINI_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "tts-1",
"input": "Hello! This is a test of the text-to-speech API.",
"voice": "alloy",
"response_format": "mp3"
}' \
--output speech.mp3
```

### Model Mapping
- `tts-1` → `gemini-2.5-flash-preview-tts` (faster, optimized for speed)
- `tts-1-hd` → `gemini-2.5-pro-preview-tts` (higher quality)

### Voice Mapping
OpenAI voices are mapped to Gemini TTS voices:
- `alloy` → Puck (neutral, balanced)
- `echo` → Charon (male voice)
- `fable` → Kore (expressive)
- `onyx` → Fenrir (deep, authoritative)
- `nova` → Aoede (warm, friendly)
- `shimmer` → Aoede (same Gemini voice as `nova`)

---

## Supported API endpoints and applicable parameters
Expand Down Expand Up @@ -194,3 +227,20 @@ For more details, refer to the [Gemini API docs](https://ai.google.dev/gemini-ap
- [x] `embeddings`
- [x] `dimensions`
- [x] `models`
- [x] `audio/speech` (Text-to-Speech)
<details>

- [x] `model`
- `tts-1` => `gemini-2.5-flash-preview-tts`
- `tts-1-hd` => `gemini-2.5-pro-preview-tts`
- Can also specify Gemini model names directly
- [x] `input` (required)
- [x] `voice` (required)
- Supported: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
- Maps to Gemini voices: Puck, Charon, Kore, Fenrir, Aoede
- [x] `response_format`
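- Accepted: `mp3`, `opus`, `aac`, `flac`, `wav`, `pcm` (only `wav` is currently converted from Gemini's PCM output; the other formats return the raw PCM bytes with the requested content type)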
- Default: `mp3`
- [ ] `speed` (not yet implemented)

</details>
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

164 changes: 164 additions & 0 deletions src/worker.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ export default {
assert(request.method === "GET");
return handleModels(apiKey)
.catch(errHandler);
case pathname.endsWith("/audio/speech"):
assert(request.method === "POST");
return handleSpeech(await request.json(), apiKey)
.catch(errHandler);
default:
throw new HttpError("404 Not Found", 404);
}
Expand Down Expand Up @@ -142,6 +146,166 @@ async function handleEmbeddings (req, apiKey) {
return new Response(body, fixCors(response));
}

/** Gemini TTS model used when the request names no model, or one that is not recognized. */
const DEFAULT_SPEECH_MODEL = "gemini-2.5-flash-preview-tts";

/**
 * Lookup table from OpenAI voice names to Gemini prebuilt voice names.
 * Covers the OpenAI voices alloy, echo, fable, onyx, nova and shimmer;
 * Gemini offers more voices than the five used here.
 */
const VOICE_MAP = {
  alloy: "Puck",      // neutral, balanced
  echo: "Charon",     // male voice
  fable: "Kore",      // expressive
  onyx: "Fenrir",     // deep, authoritative
  nova: "Aoede",      // warm, friendly
  shimmer: "Aoede",   // shares nova's voice
};
/**
 * Serve an OpenAI-style speech request (`/audio/speech`) through Gemini's
 * `generateContent` endpoint.
 *
 * @param {object} req - Parsed JSON request body; reads `model`, `input`,
 *   `voice` and `response_format`.
 * @param {*} apiKey - Forwarded to `makeHeaders` for the upstream call.
 * @returns {Promise<Response>} The audio bytes on success. When the upstream
 *   call is not OK, the upstream error text is returned with
 *   `fixCors(response)` as the response init.
 * @throws {HttpError} 400 when `input` or `voice` is missing; 500 when the
 *   upstream reply contains no inline audio data.
 */
async function handleSpeech (req, apiKey) {
  // Map OpenAI model names to Gemini TTS models
  let model;
  switch (true) {
    case typeof req.model !== "string":
      model = DEFAULT_SPEECH_MODEL;
      break;
    case req.model.startsWith("models/"):
      model = req.model.substring(7);
      break;
    case req.model.startsWith("gemini-"):
      model = req.model;
      break;
    case req.model === "tts-1-hd":
      model = "gemini-2.5-pro-preview-tts";
      break;
    default:
      // "tts-1" and every unrecognized name use the default model.
      model = DEFAULT_SPEECH_MODEL;
  }

  if (!req.input) {
    throw new HttpError("input is required", 400);
  }
  if (!req.voice) {
    throw new HttpError("voice is required", 400);
  }

  // Map OpenAI voice to Gemini voice. Only own keys count: a plain
  // `VOICE_MAP[req.voice]` would resolve names such as "constructor" or
  // "toString" to inherited functions, which JSON.stringify then drops,
  // sending the request without any voiceName instead of the fallback.
  const isKnownVoice = Object.prototype.hasOwnProperty.call(VOICE_MAP, req.voice);
  const geminiVoice = isKnownVoice ? VOICE_MAP[req.voice] : "Puck";

  // Build Gemini request
  const geminiRequest = {
    contents: [{
      parts: [{ text: req.input }],
    }],
    generationConfig: {
      responseModalities: ["AUDIO"],
      speechConfig: {
        voiceConfig: {
          prebuiltVoiceConfig: {
            voiceName: geminiVoice,
          },
        },
      },
    },
  };

  // Call Gemini API
  const url = `${BASE_URL}/${API_VERSION}/models/${model}:generateContent`;
  const response = await fetch(url, {
    method: "POST",
    headers: makeHeaders(apiKey, { "Content-Type": "application/json" }),
    body: JSON.stringify(geminiRequest),
  });

  if (!response.ok) {
    // Relay the upstream error body to the caller unchanged.
    const errorText = await response.text();
    console.error("Gemini API error:", errorText);
    return new Response(errorText, fixCors(response));
  }

  const geminiResponse = JSON.parse(await response.text());

  // Extract audio data from Gemini response. Scan every part instead of
  // assuming the audio is the first one.
  const parts = geminiResponse.candidates?.[0]?.content?.parts;
  const audioPart = Array.isArray(parts)
    ? parts.find((part) => part?.inlineData?.data)
    : undefined;
  if (!audioPart) {
    throw new HttpError("No audio data in response", 500);
  }
  const audioBytes = Buffer.from(audioPart.inlineData.data, "base64");

  // NOTE(review): only "wav" is actually converted below. Every other format
  // returns the upstream bytes untouched and merely labels them with the
  // requested MIME type, so e.g. an "mp3" response is not MP3-encoded here.
  // TODO: transcode, or reject formats that cannot be produced.
  const responseFormat = req.response_format || "mp3";
  let outputBuffer = audioBytes;
  let mimeType;
  switch (responseFormat) {
    case "opus":
      mimeType = "audio/opus";
      break;
    case "aac":
      mimeType = "audio/aac";
      break;
    case "flac":
      mimeType = "audio/flac";
      break;
    case "wav":
      mimeType = "audio/wav";
      // Convert PCM to WAV format
      outputBuffer = convertPCMToWAV(audioBytes);
      break;
    case "pcm":
      mimeType = "audio/pcm";
      break;
    default:
      // "mp3" and any unrecognized format
      mimeType = "audio/mpeg";
  }

  return new Response(outputBuffer, {
    headers: {
      "Content-Type": mimeType,
      "Access-Control-Allow-Origin": "*",
    },
  });
}

/**
 * Prefix raw PCM bytes with a 44-byte RIFF/WAVE header (format tag 1, PCM).
 *
 * The sample bytes are copied verbatim, so they must already be laid out as
 * described by the format options.
 *
 * @param {Uint8Array} pcmData - Raw PCM bytes; a Node `Buffer` or any other
 *   `Uint8Array`.
 * @param {object} [format] - Audio format written into the header. The
 *   defaults are the values this helper previously hard-coded — presumably
 *   what Gemini TTS emits; confirm against the API docs.
 * @param {number} [format.sampleRate=24000] - Samples per second.
 * @param {number} [format.numChannels=1] - Channel count.
 * @param {number} [format.bitsPerSample=16] - Bits per sample.
 * @returns {Buffer} A new buffer holding the header followed by the samples.
 */
function convertPCMToWAV(pcmData, { sampleRate = 24000, numChannels = 1, bitsPerSample = 16 } = {}) {
  const HEADER_SIZE = 44;
  const blockAlign = numChannels * bitsPerSample / 8; // bytes per sample frame
  const dataSize = pcmData.length;
  const buffer = Buffer.alloc(HEADER_SIZE + dataSize);

  // RIFF header; the size field counts everything after its own 8 bytes.
  buffer.write("RIFF", 0);
  buffer.writeUInt32LE(HEADER_SIZE - 8 + dataSize, 4);
  buffer.write("WAVE", 8);

  // fmt chunk
  buffer.write("fmt ", 12);
  buffer.writeUInt32LE(16, 16); // fmt chunk size
  buffer.writeUInt16LE(1, 20); // PCM format
  buffer.writeUInt16LE(numChannels, 22);
  buffer.writeUInt32LE(sampleRate, 24);
  buffer.writeUInt32LE(sampleRate * blockAlign, 28); // byte rate
  buffer.writeUInt16LE(blockAlign, 32); // block align
  buffer.writeUInt16LE(bitsPerSample, 34);

  // data chunk
  buffer.write("data", 36);
  buffer.writeUInt32LE(dataSize, 40);
  // TypedArray#set accepts any Uint8Array, whereas Buffer#copy exists only on
  // Buffer sources.
  buffer.set(pcmData, HEADER_SIZE);

  return buffer;
}

const DEFAULT_MODEL = "gemini-flash-latest";
async function handleCompletions (req, apiKey) {
let model;
Expand Down