diff --git a/packages/core/src/utils/tokenCalculation.test.ts b/packages/core/src/utils/tokenCalculation.test.ts
index e642669708e..a4e06edf3bb 100644
--- a/packages/core/src/utils/tokenCalculation.test.ts
+++ b/packages/core/src/utils/tokenCalculation.test.ts
@@ -178,6 +178,84 @@ describe('tokenCalculation', () => {
       // PDF estimate: 25800 tokens (~100 pages at 258 tokens/page)
       expect(count).toBe(25800);
     });
+
+    it('should use countTokens API for audio requests', async () => {
+      vi.mocked(mockContentGenerator.countTokens).mockResolvedValue({
+        totalTokens: 960,
+      });
+      const request = [
+        { inlineData: { mimeType: 'audio/mpeg', data: 'audio_data' } },
+      ];
+
+      const count = await calculateRequestTokenCount(
+        request,
+        mockContentGenerator,
+        model,
+      );
+
+      expect(count).toBe(960);
+      expect(mockContentGenerator.countTokens).toHaveBeenCalled();
+    });
+
+    it('should use duration-based estimate for audio in fallback', async () => {
+      vi.mocked(mockContentGenerator.countTokens).mockRejectedValue(
+        new Error('API error'),
+      );
+      // ~5 s of audio at 16,000 bytes/s = 80,000 raw bytes -> base64 ≈ 106,667 chars
+      const base64Data = 'A'.repeat(106_667);
+      const request = [
+        { inlineData: { mimeType: 'audio/mpeg', data: base64Data } },
+      ];
+
+      const count = await calculateRequestTokenCount(
+        request,
+        mockContentGenerator,
+        model,
+      );
+
+      // rawBytes ≈ 80,000 -> ~5s -> ~160 tokens (5 * 32); allow ±5 for rounding
+      expect(count).toBeGreaterThanOrEqual(155);
+      expect(count).toBeLessThanOrEqual(165);
+    });
+
+    it('should use countTokens API for video requests', async () => {
+      vi.mocked(mockContentGenerator.countTokens).mockResolvedValue({
+        totalTokens: 5800,
+      });
+      const request = [
+        { inlineData: { mimeType: 'video/mp4', data: 'video_data' } },
+      ];
+
+      const count = await calculateRequestTokenCount(
+        request,
+        mockContentGenerator,
+        model,
+      );
+
+      expect(count).toBe(5800);
+      expect(mockContentGenerator.countTokens).toHaveBeenCalled();
+    });
+
+    it('should use duration-based estimate for video in fallback', async () => {
+      vi.mocked(mockContentGenerator.countTokens).mockRejectedValue(
+        new Error('API error'),
+      );
+      // ~5 s of video at 250,000 bytes/s = 1,250,000 raw bytes -> base64 ≈ 1,666,667 chars
+      const base64Data = 'V'.repeat(1_666_667);
+      const request = [
+        { inlineData: { mimeType: 'video/mp4', data: base64Data } },
+      ];
+
+      const count = await calculateRequestTokenCount(
+        request,
+        mockContentGenerator,
+        model,
+      );
+
+      // rawBytes ≈ 1,250,000 -> ~5s -> ~1450 tokens (5 * 290); allow ±10 for rounding
+      expect(count).toBeGreaterThanOrEqual(1440);
+      expect(count).toBeLessThanOrEqual(1460);
+    });
   });
 
   describe('estimateTokenCountSync', () => {
@@ -285,5 +363,192 @@ describe('tokenCalculation', () => {
       expect(estimateTokenCountSync([{ text: '' }])).toBe(0);
       expect(estimateTokenCountSync([{} as Part])).toBe(0);
     });
+
+    describe('audio token estimation', () => {
+      it('should estimate audio tokens from inlineData base64 size', () => {
+        // Simulate ~10 seconds of 128kbps MP3 audio:
+        // 10s * 16,000 bytes/s = 160,000 raw bytes
+        // base64 length = 160,000 / 0.75 ≈ 213,333 chars
+        const base64Data = 'A'.repeat(213_333);
+        const parts: Part[] = [
+          { inlineData: { mimeType: 'audio/mpeg', data: base64Data } },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // rawBytes = 213,333 * 0.75 = 159,999.75
+        // duration = 159,999.75 / 16,000 ≈ 10s
+        // tokens ≈ ceil(10 * 32) = 320; asserted with a ±10 window
+        expect(tokens).toBeGreaterThanOrEqual(310);
+        expect(tokens).toBeLessThanOrEqual(330);
+      });
+
+      it('should use default estimate for audio fileData without base64', () => {
+        const parts: Part[] = [
+          {
+            fileData: {
+              mimeType: 'audio/wav',
+              fileUri: 'gs://bucket/recording.wav',
+            },
+          },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // Default audio estimate: 3840 tokens (120 s * 32 tokens/sec)
+        expect(tokens).toBe(3840);
+      });
+
+      it('should handle various audio MIME types', () => {
+        const mimeTypes = [
+          'audio/mpeg',
+          'audio/wav',
+          'audio/ogg',
+          'audio/flac',
+          'audio/aac',
+          'audio/mp4',
+        ];
+
+        for (const mimeType of mimeTypes) {
+          const parts: Part[] = [
+            {
+              fileData: { mimeType, fileUri: 'gs://bucket/file' },
+            },
+          ];
+          // All should use the audio estimation path, not the JSON fallback
+          expect(estimateTokenCountSync(parts)).toBe(3840);
+        }
+      });
+
+      it('should return 0 tokens for empty base64 data', () => {
+        const parts: Part[] = [
+          { inlineData: { mimeType: 'audio/mpeg', data: '' } },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // Empty data = 0 bytes = 0 seconds = 0 tokens
+        expect(tokens).toBe(0);
+      });
+
+      it('should estimate small audio clips with minimal tokens', () => {
+        // ~1 second of audio: 16,000 raw bytes -> base64 ≈ 21,333 chars
+        const base64Data = 'B'.repeat(21_333);
+        const parts: Part[] = [
+          { inlineData: { mimeType: 'audio/mp4', data: base64Data } },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // rawBytes ≈ 16,000 -> duration ≈ 1s -> tokens ≈ ceil(32) = 32 (±2 window)
+        expect(tokens).toBeGreaterThanOrEqual(30);
+        expect(tokens).toBeLessThanOrEqual(34);
+      });
+    });
+
+    describe('video token estimation', () => {
+      it('should estimate video tokens from inlineData base64 size', () => {
+        // Simulate ~10 seconds of ~2Mbps video:
+        // 10s * 250,000 bytes/s = 2,500,000 raw bytes
+        // base64 length = 2,500,000 / 0.75 ≈ 3,333,333 chars
+        const base64Data = 'V'.repeat(3_333_333);
+        const parts: Part[] = [
+          { inlineData: { mimeType: 'video/mp4', data: base64Data } },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // rawBytes ≈ 2,500,000 -> duration ≈ 10s
+        // tokens ≈ ceil(10 * 290) = 2900; asserted with a ±20 window
+        expect(tokens).toBeGreaterThanOrEqual(2880);
+        expect(tokens).toBeLessThanOrEqual(2920);
+      });
+
+      it('should use default estimate for video fileData without base64', () => {
+        const parts: Part[] = [
+          {
+            fileData: {
+              mimeType: 'video/mp4',
+              fileUri: 'gs://bucket/clip.mp4',
+            },
+          },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // Default video estimate: 17,400 tokens (60 s * 290 tokens/sec)
+        expect(tokens).toBe(17400);
+      });
+
+      it('should return 0 tokens for empty base64 video data', () => {
+        const parts: Part[] = [
+          { inlineData: { mimeType: 'video/mp4', data: '' } },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // Empty data = 0 bytes = 0 seconds = 0 tokens
+        expect(tokens).toBe(0);
+      });
+
+      it('should handle various video MIME types', () => {
+        const mimeTypes = [
+          'video/mp4',
+          'video/webm',
+          'video/quicktime',
+          'video/x-msvideo',
+        ];
+
+        for (const mimeType of mimeTypes) {
+          const parts: Part[] = [
+            {
+              fileData: { mimeType, fileUri: 'gs://bucket/file' },
+            },
+          ];
+          expect(estimateTokenCountSync(parts)).toBe(17400);
+        }
+      });
+    });
+
+    describe('mixed multimodal content', () => {
+      it('should correctly sum tokens for text + audio + image parts', () => {
+        // "Describe this audio" = 19 ASCII chars -> 19 * 0.25 = 4.75 tokens
+        // audio fileData -> 3840 default tokens
+        // image -> 3000 tokens
+        const parts: Part[] = [
+          { text: 'Describe this audio' },
+          {
+            fileData: {
+              mimeType: 'audio/mpeg',
+              fileUri: 'gs://bucket/speech.mp3',
+            },
+          },
+          { inlineData: { mimeType: 'image/png', data: 'img_data' } },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // floor(4.75 + 3840 + 3000) = floor(6844.75) = 6844
+        expect(tokens).toBe(6844);
+      });
+
+      it('should handle Gemini 3 nested audio parts in functionResponse', () => {
+        const parts: Part[] = [
+          {
+            functionResponse: {
+              name: 'audio_tool',
+              id: '789',
+              response: { status: 'ok' },
+              parts: [
+                {
+                  fileData: {
+                    mimeType: 'audio/wav',
+                    fileUri: 'gs://bucket/output.wav',
+                  },
+                },
+                { text: 'Audio transcription here' },
+              ] as Part[],
+            },
+          },
+        ];
+
+        const tokens = estimateTokenCountSync(parts);
+        // audio default 3840 + text 6 + response ~4 + name ~2 = ~3852
+        expect(tokens).toBeGreaterThan(3840);
+        expect(tokens).toBeLessThan(3900);
+      });
+    });
   });
 });
diff --git a/packages/core/src/utils/tokenCalculation.ts
b/packages/core/src/utils/tokenCalculation.ts
index d5a7fdc9eb9..61370ceef0a 100644
--- a/packages/core/src/utils/tokenCalculation.ts
+++ b/packages/core/src/utils/tokenCalculation.ts
@@ -20,6 +20,26 @@ const IMAGE_TOKEN_ESTIMATE = 3000;
 // See: https://ai.google.dev/gemini-api/docs/document-processing
 const PDF_TOKEN_ESTIMATE = 25800;
 
+// Audio token estimation constants.
+// The Gemini API tokenizes audio at ~32 tokens per second.
+// See: https://ai.google.dev/gemini-api/docs/audio
+const AUDIO_TOKENS_PER_SECOND = 32;
+// Conservative bitrate for compressed audio duration estimation (128 kbps).
+// Used to convert raw file size to an approximate duration.
+const COMPRESSED_AUDIO_BYTES_PER_SECOND = 16_000;
+// Default audio token estimate when base64 data is unavailable (~2 min).
+const DEFAULT_AUDIO_TOKEN_ESTIMATE = 120 * AUDIO_TOKENS_PER_SECOND;
+
+// Video token estimation constants.
+// Video frames are tokenized at 258 tokens/frame at 1 fps, plus the audio
+// track at 32 tokens/second.
+// See: https://ai.google.dev/gemini-api/docs/vision#video
+const VIDEO_TOKENS_PER_SECOND = 258 + AUDIO_TOKENS_PER_SECOND;
+// Conservative bitrate for compressed video duration estimation (~2 Mbps).
+const COMPRESSED_VIDEO_BYTES_PER_SECOND = 250_000;
+// Default video token estimate when base64 data is unavailable (~1 min).
+const DEFAULT_VIDEO_TOKEN_ESTIMATE = 60 * VIDEO_TOKENS_PER_SECOND;
+
 // Maximum number of characters to process with the full character-by-character heuristic.
 // Above this, we use a faster approximation to avoid performance bottlenecks.
 const MAX_CHARS_FOR_FULL_HEURISTIC = 100_000;
@@ -50,7 +70,80 @@ function estimateTextTokens(text: string): number {
 }
 
 /**
- * Heuristic estimation for media parts (images, PDFs) using fixed safe estimates.
+ * Estimates tokens for time-based media (audio, video) from base64 data size.
+ *
+ * Converts the base64 length to a raw byte count (discounting trailing `=`
+ * padding, which carries no payload), estimates the duration using the given
+ * bitrate, then multiplies by the given token rate and rounds up.
+ *
+ * @param base64Data Base64-encoded payload, or undefined when only a file
+ *   reference is available.
+ * @param bytesPerSecond Assumed bitrate used to turn bytes into seconds.
+ * @param tokensPerSecond Token rate applied to the estimated duration.
+ * @param defaultEstimate Value returned when base64Data is undefined.
+ * @returns The estimated token count; 0 for an empty payload.
+ */
+function estimateDurationBasedTokens(
+  base64Data: string | undefined,
+  bytesPerSecond: number,
+  tokensPerSecond: number,
+  defaultEstimate: number,
+): number {
+  if (base64Data === undefined) return defaultEstimate;
+  // Every 4 base64 chars encode 3 bytes; each trailing '=' marks one byte
+  // of that final group that is not actually present in the payload.
+  const paddingChars = base64Data.endsWith('==')
+    ? 2
+    : base64Data.endsWith('=')
+      ? 1
+      : 0;
+  const rawBytes = Math.max(0, base64Data.length * 0.75 - paddingChars);
+  const estimatedSeconds = rawBytes / bytesPerSecond;
+  return Math.ceil(estimatedSeconds * tokensPerSecond);
+}
+
+/**
+ * Estimates audio tokens from base64 data size.
+ *
+ * Assumes a conservative compressed-audio bitrate to derive the duration,
+ * then applies the Gemini API's audio token rate (32 tokens/second).
+ *
+ * @param base64Data Inline base64 audio data, if any.
+ * @returns The size-based estimate, or a fixed default (~2 min of audio)
+ *   when no base64 data is available (e.g. fileData references).
+ */
+function estimateAudioTokens(base64Data: string | undefined): number {
+  return estimateDurationBasedTokens(
+    base64Data,
+    COMPRESSED_AUDIO_BYTES_PER_SECOND,
+    AUDIO_TOKENS_PER_SECOND,
+    DEFAULT_AUDIO_TOKEN_ESTIMATE,
+  );
+}
+
+/**
+ * Estimates video tokens from base64 data size.
+ *
+ * Assumes a conservative compressed-video bitrate to derive the duration,
+ * then applies the combined frame + audio token rate
+ * (258 + 32 = 290 tokens/second).
+ *
+ * @param base64Data Inline base64 video data, if any.
+ * @returns The size-based estimate, or a fixed default (~1 min of video)
+ *   when no base64 data is available (e.g. fileData references).
+ */
+function estimateVideoTokens(base64Data: string | undefined): number {
+  return estimateDurationBasedTokens(
+    base64Data,
+    COMPRESSED_VIDEO_BYTES_PER_SECOND,
+    VIDEO_TOKENS_PER_SECOND,
+    DEFAULT_VIDEO_TOKEN_ESTIMATE,
+  );
+}
+
+/**
+ * Heuristic estimation for media parts (images, PDFs, audio, video) using
+ * either fixed safe estimates or data-size-based duration estimation.
  */
 function estimateMediaTokens(part: Part): number | undefined {
   const inlineData = 'inlineData' in part ? part.inlineData : undefined;
@@ -65,6 +158,14 @@ function estimateMediaTokens(part: Part): number | undefined {
     // PDFs: 25,800 tokens (~100 pages at 258 tokens/page)
     // See: https://ai.google.dev/gemini-api/docs/document-processing
     return PDF_TOKEN_ESTIMATE;
+  } else if (mimeType?.startsWith('audio/')) {
+    // Audio: ~32 tokens per second of audio content.
+    // See: https://ai.google.dev/gemini-api/docs/audio
+    return estimateAudioTokens(inlineData?.data);
+  } else if (mimeType?.startsWith('video/')) {
+    // Video: 258 tokens/frame at 1 fps + 32 tokens/sec for the audio track.
+    // See: https://ai.google.dev/gemini-api/docs/vision#video
+    return estimateVideoTokens(inlineData?.data);
   }
   return undefined;
 }