Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 265 additions & 0 deletions packages/core/src/utils/tokenCalculation.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,84 @@ describe('tokenCalculation', () => {
// PDF estimate: 25800 tokens (~100 pages at 258 tokens/page)
expect(count).toBe(25800);
});

// When the countTokens endpoint succeeds, its total is returned verbatim for audio input.
it('should use countTokens API for audio requests', async () => {
  const countTokensMock = vi.mocked(mockContentGenerator.countTokens);
  countTokensMock.mockResolvedValue({
    totalTokens: 960,
  });

  const audioPart = {
    inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' },
  };
  const audioRequest = [audioPart];

  const count = await calculateRequestTokenCount(
    audioRequest,
    mockContentGenerator,
    model,
  );

  expect(count).toBe(960);
  expect(mockContentGenerator.countTokens).toHaveBeenCalled();
});

// When countTokens rejects, audio is expected to be sized by the local heuristic.
it('should use duration-based estimate for audio in fallback', async () => {
  const countTokensMock = vi.mocked(mockContentGenerator.countTokens);
  countTokensMock.mockRejectedValue(new Error('API error'));

  // Roughly 5 s of audio: 5 * 16,000 = 80,000 raw bytes,
  // i.e. about 106,667 base64 characters.
  const fiveSecondClip = 'A'.repeat(106_667);
  const audioRequest = [
    { inlineData: { mimeType: 'audio/mpeg', data: fiveSecondClip } },
  ];

  const count = await calculateRequestTokenCount(
    audioRequest,
    mockContentGenerator,
    model,
  );

  // ~80,000 raw bytes -> ~5 s -> roughly 5 * 32 = 160 tokens.
  expect(count).toBeGreaterThanOrEqual(155);
  expect(count).toBeLessThanOrEqual(165);
});

// When the countTokens endpoint succeeds, its total is returned verbatim for video input.
it('should use countTokens API for video requests', async () => {
  const countTokensMock = vi.mocked(mockContentGenerator.countTokens);
  countTokensMock.mockResolvedValue({
    totalTokens: 5800,
  });

  const videoPart = {
    inlineData: { mimeType: 'video/mp4', data: 'video_data' },
  };
  const videoRequest = [videoPart];

  const count = await calculateRequestTokenCount(
    videoRequest,
    mockContentGenerator,
    model,
  );

  expect(count).toBe(5800);
  expect(mockContentGenerator.countTokens).toHaveBeenCalled();
});

// When countTokens rejects, video is expected to be sized by the local heuristic.
it('should use duration-based estimate for video in fallback', async () => {
  const countTokensMock = vi.mocked(mockContentGenerator.countTokens);
  countTokensMock.mockRejectedValue(new Error('API error'));

  // Roughly 5 s of video: 5 * 250,000 = 1,250,000 raw bytes,
  // i.e. about 1,666,667 base64 characters.
  const fiveSecondClip = 'V'.repeat(1_666_667);
  const videoRequest = [
    { inlineData: { mimeType: 'video/mp4', data: fiveSecondClip } },
  ];

  const count = await calculateRequestTokenCount(
    videoRequest,
    mockContentGenerator,
    model,
  );

  // ~1,250,000 raw bytes -> ~5 s -> roughly 5 * 290 = 1450 tokens.
  expect(count).toBeGreaterThanOrEqual(1440);
  expect(count).toBeLessThanOrEqual(1460);
});
});

describe('estimateTokenCountSync', () => {
Expand Down Expand Up @@ -285,5 +363,192 @@ describe('tokenCalculation', () => {
expect(estimateTokenCountSync([{ text: '' }])).toBe(0);
expect(estimateTokenCountSync([{} as Part])).toBe(0);
});

describe('audio token estimation', () => {
  // Fixed estimate expected for audio that carries no inline payload
  // (~2 minutes at 32 tokens/second).
  const defaultAudioTokens = 3840;

  it('should estimate audio tokens from inlineData base64 size', () => {
    // About 10 s of 128 kbps MP3:
    //   10 s * 16,000 bytes/s = 160,000 raw bytes
    //   160,000 / 0.75 ≈ 213,333 base64 characters
    const tenSecondClip = 'A'.repeat(213_333);
    const audioParts: Part[] = [
      { inlineData: { mimeType: 'audio/mpeg', data: tenSecondClip } },
    ];

    const tokens = estimateTokenCountSync(audioParts);

    // 213,333 * 0.75 = 159,999.75 bytes -> ≈ 10 s -> ceil(10 * 32) = 320
    expect(tokens).toBeGreaterThanOrEqual(310);
    expect(tokens).toBeLessThanOrEqual(330);
  });

  it('should use default estimate for audio fileData without base64', () => {
    const audioParts: Part[] = [
      {
        fileData: {
          mimeType: 'audio/wav',
          fileUri: 'gs://bucket/recording.wav',
        },
      },
    ];

    const tokens = estimateTokenCountSync(audioParts);

    expect(tokens).toBe(defaultAudioTokens);
  });

  it('should handle various audio MIME types', () => {
    const audioMimeTypes = [
      'audio/mpeg',
      'audio/wav',
      'audio/ogg',
      'audio/flac',
      'audio/aac',
      'audio/mp4',
    ];

    audioMimeTypes.forEach((mimeType) => {
      const audioParts: Part[] = [
        {
          fileData: { mimeType, fileUri: 'gs://bucket/file' },
        },
      ];
      // Each type must hit the audio estimation path rather than the JSON fallback.
      expect(estimateTokenCountSync(audioParts)).toBe(defaultAudioTokens);
    });
  });

  it('should return 0 tokens for empty base64 data', () => {
    const audioParts: Part[] = [
      { inlineData: { mimeType: 'audio/mpeg', data: '' } },
    ];

    const tokens = estimateTokenCountSync(audioParts);

    // No data -> 0 bytes -> 0 seconds -> 0 tokens.
    expect(tokens).toBe(0);
  });

  it('should estimate small audio clips with minimal tokens', () => {
    // About 1 s of audio: 16,000 raw bytes ≈ 21,333 base64 characters.
    const oneSecondClip = 'B'.repeat(21_333);
    const audioParts: Part[] = [
      { inlineData: { mimeType: 'audio/mp4', data: oneSecondClip } },
    ];

    const tokens = estimateTokenCountSync(audioParts);

    // ≈ 16,000 bytes -> ≈ 1 s -> ceil(32) = 32 tokens.
    expect(tokens).toBeGreaterThanOrEqual(30);
    expect(tokens).toBeLessThanOrEqual(34);
  });
});

describe('video token estimation', () => {
  // Fixed estimate expected for video that carries no inline payload
  // (~1 minute at 290 tokens/second).
  const defaultVideoTokens = 17400;

  it('should estimate video tokens from inlineData base64 size', () => {
    // About 10 s of ~2 Mbps video:
    //   10 s * 250,000 bytes/s = 2,500,000 raw bytes
    //   2,500,000 / 0.75 ≈ 3,333,333 base64 characters
    const tenSecondClip = 'V'.repeat(3_333_333);
    const videoParts: Part[] = [
      { inlineData: { mimeType: 'video/mp4', data: tenSecondClip } },
    ];

    const tokens = estimateTokenCountSync(videoParts);

    // ≈ 2,500,000 bytes -> ≈ 10 s -> ceil(10 * 290) = 2900
    expect(tokens).toBeGreaterThanOrEqual(2880);
    expect(tokens).toBeLessThanOrEqual(2920);
  });

  it('should use default estimate for video fileData without base64', () => {
    const videoParts: Part[] = [
      {
        fileData: {
          mimeType: 'video/mp4',
          fileUri: 'gs://bucket/clip.mp4',
        },
      },
    ];

    const tokens = estimateTokenCountSync(videoParts);

    expect(tokens).toBe(defaultVideoTokens);
  });

  it('should return 0 tokens for empty base64 video data', () => {
    const videoParts: Part[] = [
      { inlineData: { mimeType: 'video/mp4', data: '' } },
    ];

    const tokens = estimateTokenCountSync(videoParts);

    // No data -> 0 bytes -> 0 seconds -> 0 tokens.
    expect(tokens).toBe(0);
  });

  it('should handle various video MIME types', () => {
    const videoMimeTypes = [
      'video/mp4',
      'video/webm',
      'video/quicktime',
      'video/x-msvideo',
    ];

    videoMimeTypes.forEach((mimeType) => {
      const videoParts: Part[] = [
        {
          fileData: { mimeType, fileUri: 'gs://bucket/file' },
        },
      ];
      expect(estimateTokenCountSync(videoParts)).toBe(defaultVideoTokens);
    });
  });
});

describe('mixed multimodal content', () => {
  it('should correctly sum tokens for text + audio + image parts', () => {
    // Expected contributions:
    //   text  "Describe this audio" (19 ASCII chars) -> 19 * 0.25 = 4.75
    //   audio fileData                               -> 3840 (default)
    //   image                                        -> 3000
    const promptPart: Part = { text: 'Describe this audio' };
    const audioPart: Part = {
      fileData: {
        mimeType: 'audio/mpeg',
        fileUri: 'gs://bucket/speech.mp3',
      },
    };
    const imagePart: Part = {
      inlineData: { mimeType: 'image/png', data: 'img_data' },
    };
    const parts: Part[] = [promptPart, audioPart, imagePart];

    const tokens = estimateTokenCountSync(parts);

    // floor(4.75 + 3840 + 3000) = floor(6844.75) = 6844
    expect(tokens).toBe(6844);
  });

  it('should handle Gemini 3 nested audio parts in functionResponse', () => {
    // Media and text nested inside the function response payload.
    const nestedParts = [
      {
        fileData: {
          mimeType: 'audio/wav',
          fileUri: 'gs://bucket/output.wav',
        },
      },
      { text: 'Audio transcription here' },
    ] as Part[];

    const parts: Part[] = [
      {
        functionResponse: {
          name: 'audio_tool',
          id: '789',
          response: { status: 'ok' },
          parts: nestedParts,
        },
      },
    ];

    const tokens = estimateTokenCountSync(parts);

    // Approximately: audio default 3840 + text 6 + response ~4 + name ~2 = ~3852
    expect(tokens).toBeGreaterThan(3840);
    expect(tokens).toBeLessThan(3900);
  });
});
});
});
65 changes: 64 additions & 1 deletion packages/core/src/utils/tokenCalculation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,26 @@ const IMAGE_TOKEN_ESTIMATE = 3000;
// See: https://ai.google.dev/gemini-api/docs/document-processing
const PDF_TOKEN_ESTIMATE = 25800;

// ---------------------------------------------------------------------------
// Audio token estimation
// ---------------------------------------------------------------------------

// Token rate the Gemini API applies to audio input (~32 tokens per second).
// See: https://ai.google.dev/gemini-api/docs/audio
const AUDIO_TOKENS_PER_SECOND = 32;

// Assumed byte rate of compressed audio (128 kbps = 16,000 bytes/s); used to
// turn a payload size into an approximate playback duration.
const COMPRESSED_AUDIO_BYTES_PER_SECOND = 16_000;

// Fallback estimate used when no base64 payload is available to measure:
// about two minutes of audio.
const DEFAULT_AUDIO_TOKEN_ESTIMATE = AUDIO_TOKENS_PER_SECOND * 120;

// ---------------------------------------------------------------------------
// Video token estimation
// ---------------------------------------------------------------------------

// Video costs 258 tokens per frame sampled at 1 fps, plus the audio track at
// the audio token rate.
// See: https://ai.google.dev/gemini-api/docs/vision#video
const VIDEO_TOKENS_PER_SECOND = AUDIO_TOKENS_PER_SECOND + 258;

// Assumed byte rate of compressed video (~2 Mbps = 250,000 bytes/s); used to
// turn a payload size into an approximate playback duration.
const COMPRESSED_VIDEO_BYTES_PER_SECOND = 250_000;

// Fallback estimate used when no base64 payload is available to measure:
// about one minute of video.
const DEFAULT_VIDEO_TOKEN_ESTIMATE = VIDEO_TOKENS_PER_SECOND * 60;

// Maximum number of characters to process with the full character-by-character heuristic.
// Above this, we use a faster approximation to avoid performance bottlenecks.
const MAX_CHARS_FOR_FULL_HEURISTIC = 100_000;
Expand Down Expand Up @@ -50,7 +70,42 @@ function estimateTextTokens(text: string): number {
}

/**
 * Approximates the token cost of an audio payload from the length of its
 * base64 encoding.
 *
 * The base64 length is scaled by 0.75 to get a raw byte count, the byte count
 * is divided by COMPRESSED_AUDIO_BYTES_PER_SECOND to get a duration, and the
 * duration is multiplied by AUDIO_TOKENS_PER_SECOND and rounded up.
 *
 * @param base64Data Base64-encoded audio, or `undefined` when the part has no
 *   inline payload (e.g. a fileData reference).
 * @returns DEFAULT_AUDIO_TOKEN_ESTIMATE when `base64Data` is `undefined`;
 *   otherwise the size-derived estimate (0 for an empty string).
 */
function estimateAudioTokens(base64Data: string | undefined): number {
  if (base64Data === undefined) {
    // Nothing to measure, so fall back to the fixed default.
    return DEFAULT_AUDIO_TOKEN_ESTIMATE;
  }

  // Every 4 base64 characters encode 3 bytes.
  const decodedByteCount = base64Data.length * 0.75;
  const durationSeconds = decodedByteCount / COMPRESSED_AUDIO_BYTES_PER_SECOND;
  const tokenEstimate = durationSeconds * AUDIO_TOKENS_PER_SECOND;
  return Math.ceil(tokenEstimate);
}

/**
 * Approximates the token cost of a video payload from the length of its
 * base64 encoding.
 *
 * The base64 length is scaled by 0.75 to get a raw byte count, the byte count
 * is divided by COMPRESSED_VIDEO_BYTES_PER_SECOND to get a duration, and the
 * duration is multiplied by VIDEO_TOKENS_PER_SECOND (frame rate plus audio
 * track, 258 + 32 = 290 tokens/second) and rounded up.
 *
 * @param base64Data Base64-encoded video, or `undefined` when the part has no
 *   inline payload (e.g. a fileData reference).
 * @returns DEFAULT_VIDEO_TOKEN_ESTIMATE when `base64Data` is `undefined`;
 *   otherwise the size-derived estimate (0 for an empty string).
 */
function estimateVideoTokens(base64Data: string | undefined): number {
  if (base64Data === undefined) {
    // Nothing to measure, so fall back to the fixed default.
    return DEFAULT_VIDEO_TOKEN_ESTIMATE;
  }

  // Every 4 base64 characters encode 3 bytes.
  const decodedByteCount = base64Data.length * 0.75;
  const durationSeconds = decodedByteCount / COMPRESSED_VIDEO_BYTES_PER_SECOND;
  const tokenEstimate = durationSeconds * VIDEO_TOKENS_PER_SECOND;
  return Math.ceil(tokenEstimate);
}

/**
* Heuristic estimation for media parts (images, PDFs, audio, video) using
* either fixed safe estimates or data-size-based duration estimation.
*/
function estimateMediaTokens(part: Part): number | undefined {
const inlineData = 'inlineData' in part ? part.inlineData : undefined;
Expand All @@ -65,6 +120,14 @@ function estimateMediaTokens(part: Part): number | undefined {
// PDFs: 25,800 tokens (~100 pages at 258 tokens/page)
// See: https://ai.google.dev/gemini-api/docs/document-processing
return PDF_TOKEN_ESTIMATE;
} else if (mimeType?.startsWith('audio/')) {
// Audio: ~32 tokens per second of audio content.
// See: https://ai.google.dev/gemini-api/docs/audio
return estimateAudioTokens(inlineData?.data);
} else if (mimeType?.startsWith('video/')) {
// Video: 258 tokens/frame at 1 fps + 32 tokens/sec for the audio track.
// See: https://ai.google.dev/gemini-api/docs/vision#video
return estimateVideoTokens(inlineData?.data);
}
return undefined;
}
Expand Down