Skip to content

Commit 81d5d73

Browse files
committed
Merge branch 'main' into brian/strict-oai-tool-schema
2 parents 8902c7f + b10503d commit 81d5d73

File tree

6 files changed

+331
-8
lines changed

6 files changed

+331
-8
lines changed

.changeset/long-cameras-throw.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@livekit/agents': patch
3+
---
4+
5+
Emit away events for User

agents/src/voice/agent_activity.ts

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -625,11 +625,21 @@ export class AgentActivity implements RecognitionHooks {
625625
return;
626626
}
627627

628+
// Refactored interruption word count check:
629+
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
630+
// - Apply check to all STT results: empty string, undefined, or any length
631+
// - This ensures consistent behavior across all interruption scenarios
628632
if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
629633
const text = this.audioRecognition.currentTranscript;
630-
631634
// TODO(shubhra): better word splitting for multi-language
632-
if (text && splitWords(text, true).length < this.agentSession.options.minInterruptionWords) {
635+
636+
// Normalize text: convert undefined/null to empty string for consistent word counting
637+
const normalizedText = text ?? '';
638+
const wordCount = splitWords(normalizedText, true).length;
639+
640+
// Only allow interruption if word count meets or exceeds minInterruptionWords
641+
// This applies to all cases: empty strings, partial speech, and full speech
642+
if (wordCount < this.agentSession.options.minInterruptionWords) {
633643
return;
634644
}
635645
}
@@ -767,19 +777,30 @@ export class AgentActivity implements RecognitionHooks {
767777
return true;
768778
}
769779

780+
// Refactored interruption word count check for consistency with onVADInferenceDone:
781+
// - Always apply minInterruptionWords filtering when STT is available and minInterruptionWords > 0
782+
// - Use consistent word splitting logic with splitWords (matching onVADInferenceDone pattern)
770783
if (
771784
this.stt &&
772785
this.turnDetection !== 'manual' &&
773786
this._currentSpeech &&
774787
this._currentSpeech.allowInterruptions &&
775788
!this._currentSpeech.interrupted &&
776-
this.agentSession.options.minInterruptionWords > 0 &&
777-
info.newTranscript.split(' ').length < this.agentSession.options.minInterruptionWords
789+
this.agentSession.options.minInterruptionWords > 0
778790
) {
779-
// avoid interruption if the new_transcript is too short
780-
this.cancelPreemptiveGeneration();
781-
this.logger.info('skipping user input, new_transcript is too short');
782-
return false;
791+
const wordCount = splitWords(info.newTranscript, true).length;
792+
if (wordCount < this.agentSession.options.minInterruptionWords) {
793+
// avoid interruption if the new_transcript contains fewer words than minInterruptionWords
794+
this.cancelPreemptiveGeneration();
795+
this.logger.info(
796+
{
797+
wordCount,
798+
minInterruptionWords: this.agentSession.options.minInterruptionWords,
799+
},
800+
'skipping user input, word count below minimum interruption threshold',
801+
);
802+
return false;
803+
}
783804
}
784805

785806
const oldTask = this._userTurnCompletedTask;

agents/src/voice/agent_session.ts

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ export interface VoiceOptions {
5858
maxEndpointingDelay: number;
5959
maxToolSteps: number;
6060
preemptiveGeneration: boolean;
61+
userAwayTimeout?: number | null;
6162
}
6263

6364
const defaultVoiceOptions: VoiceOptions = {
@@ -69,6 +70,7 @@ const defaultVoiceOptions: VoiceOptions = {
6970
maxEndpointingDelay: 6000,
7071
maxToolSteps: 3,
7172
preemptiveGeneration: false,
73+
userAwayTimeout: 15.0,
7274
} as const;
7375

7476
export type TurnDetectionMode = 'stt' | 'vad' | 'realtime_llm' | 'manual' | _TurnDetector;
@@ -123,6 +125,7 @@ export class AgentSession<
123125
private _output: AgentOutput;
124126

125127
private closingTask: Promise<void> | null = null;
128+
private userAwayTimer: NodeJS.Timeout | null = null;
126129

127130
constructor(opts: AgentSessionOptions<UserData>) {
128131
super();
@@ -167,6 +170,8 @@ export class AgentSession<
167170
// This is the "global" chat context, it holds the entire conversation history
168171
this._chatCtx = ChatContext.empty();
169172
this.options = { ...defaultVoiceOptions, ...voiceOptions };
173+
174+
this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed.bind(this));
170175
}
171176

172177
get input(): AgentInput {
@@ -416,6 +421,14 @@ export class AgentSession<
416421

417422
const oldState = this._agentState;
418423
this._agentState = state;
424+
425+
// Handle user away timer based on state changes
426+
if (state === 'listening' && this.userState === 'listening') {
427+
this._setUserAwayTimer();
428+
} else {
429+
this._cancelUserAwayTimer();
430+
}
431+
419432
this.emit(
420433
AgentSessionEventTypes.AgentStateChanged,
421434
createAgentStateChangedEvent(oldState, state),
@@ -430,6 +443,14 @@ export class AgentSession<
430443

431444
const oldState = this.userState;
432445
this.userState = state;
446+
447+
// Handle user away timer based on state changes
448+
if (state === 'listening' && this._agentState === 'listening') {
449+
this._setUserAwayTimer();
450+
} else {
451+
this._cancelUserAwayTimer();
452+
}
453+
433454
this.emit(
434455
AgentSessionEventTypes.UserStateChanged,
435456
createUserStateChangedEvent(oldState, state),
@@ -451,6 +472,37 @@ export class AgentSession<
451472

452473
private onTextOutputChanged(): void {}
453474

475+
/**
 * (Re)arms the user-away timer.
 *
 * Any pending timer is cancelled first, so at most one is ever outstanding.
 * No timer is armed when `options.userAwayTimeout` is null/undefined, or when
 * a `roomIO` exists but reports that no participant is available.
 * When the timer fires, the user state is switched to `'away'`.
 */
private _setUserAwayTimer(): void {
  this._cancelUserAwayTimer();

  const timeoutSeconds = this.options.userAwayTimeout;
  if (timeoutSeconds == null) {
    // null or undefined: no away timer is scheduled.
    return;
  }

  const participantMissing = this.roomIO ? !this.roomIO.isParticipantAvailable : false;
  if (participantMissing) {
    return;
  }

  const markUserAway = (): void => {
    this.logger.debug('User away timeout triggered');
    this._updateUserState('away');
  };

  // The option is multiplied by 1000 because setTimeout expects milliseconds.
  this.userAwayTimer = setTimeout(markUserAway, timeoutSeconds * 1000);
}
491+
492+
/**
 * Clears the pending user-away timer, if there is one, and resets the stored
 * handle to `null`. Calling it with no timer pending is a no-op.
 */
private _cancelUserAwayTimer(): void {
  const pending = this.userAwayTimer;
  if (pending === null) {
    return;
  }

  clearTimeout(pending);
  this.userAwayTimer = null;
}
498+
499+
/**
 * Handler for user-input-transcribed events.
 *
 * If the user is currently marked `'away'` and the event carries a final
 * transcript, the user state is moved back to `'listening'`. All other events
 * are ignored.
 *
 * @param ev - The transcription event; only `isFinal` is inspected here.
 */
private _onUserInputTranscribed(ev: UserInputTranscribedEvent): void {
  const userHasReturned = this.userState === 'away' && ev.isFinal;
  if (!userHasReturned) {
    return;
  }

  this.logger.debug('User returned from away state due to speech input');
  this._updateUserState('listening');
}
505+
454506
private async closeImpl(
455507
reason: CloseReason,
456508
error: RealtimeModelError | LLMError | TTSError | STTError | null = null,
@@ -460,6 +512,8 @@ export class AgentSession<
460512
return;
461513
}
462514

515+
this._cancelUserAwayTimer();
516+
463517
if (this.activity) {
464518
if (!drain) {
465519
try {
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
/**
6+
* Unit tests for the word-count logic behind AgentActivity's interruption detection; they call splitWords directly and do not instantiate AgentActivity.
7+
*
8+
* Tests the refactored minInterruptionWords check which ensures:
9+
* - Consistent word count filtering across all speech scenarios
10+
* - Proper handling of empty strings, undefined, and short speech
11+
* - Interruptions allowed only when word count meets or exceeds minInterruptionWords threshold
12+
*/
13+
import { describe, expect, it } from 'vitest';
14+
import { splitWords } from '../tokenize/basic/word.js';
15+
16+
describe('Interruption Detection - Word Counting', () => {
17+
describe('Word Splitting Behavior', () => {
  // Whitespace runs are built with repeat() so that the intended number of
  // spaces is explicit and cannot be silently collapsed to a single space.
  const gap = ' '.repeat(3);
  const pad = ' '.repeat(2);

  // Each case passes `text` to splitWords (second argument `true`, as in the
  // interruption check) and compares the number of returned words.
  const cases: { title: string; text: string; expected: number }[] = [
    { title: 'should count empty string as 0 words', text: '', expected: 0 },
    { title: 'should count single word correctly', text: 'hello', expected: 1 },
    { title: 'should count two words correctly', text: 'hello world', expected: 2 },
    {
      title: 'should count multiple words correctly',
      text: 'hello this is a full sentence',
      expected: 6,
    },
    { title: 'should handle punctuation correctly', text: 'hello, world!', expected: 2 },
    {
      title: 'should handle multiple spaces between words',
      text: `hello${gap}world`,
      expected: 2,
    },
    { title: 'should count whitespace-only string as 0 words', text: gap, expected: 0 },
    {
      title: 'should handle leading and trailing whitespace',
      text: `${pad}hello world${pad}`,
      expected: 2,
    },
  ];

  for (const { title, text, expected } of cases) {
    it(title, () => {
      const wordCount = splitWords(text, true).length;
      expect(wordCount).toBe(expected);
    });
  }
});
66+
67+
describe('Integration: Full Interruption Check Logic', () => {
  /**
   * Local copy of the check being tested: a missing transcript is treated as an
   * empty string, its words are counted with splitWords, and the interruption
   * is blocked while the count is below `minInterruptionWords`.
   */
  const runCheck = (text: string | undefined, minInterruptionWords: number) => {
    const normalizedText = text ?? '';
    const wordCount = splitWords(normalizedText, true).length;
    return { normalizedText, wordCount, shouldBlock: wordCount < minInterruptionWords };
  };

  it('should block interruption for empty transcript with threshold 2', () => {
    expect(runCheck('', 2)).toEqual({
      normalizedText: '',
      wordCount: 0,
      shouldBlock: true,
    });
  });

  it('should block interruption for undefined transcript with threshold 2', () => {
    expect(runCheck(undefined, 2)).toEqual({
      normalizedText: '',
      wordCount: 0,
      shouldBlock: true,
    });
  });

  it('should block interruption for single word with threshold 2', () => {
    expect(runCheck('hello', 2)).toEqual({
      normalizedText: 'hello',
      wordCount: 1,
      shouldBlock: true,
    });
  });

  it('should allow interruption when word count exactly meets threshold', () => {
    expect(runCheck('hello world', 2)).toEqual({
      normalizedText: 'hello world',
      wordCount: 2,
      shouldBlock: false,
    });
  });

  it('should allow interruption when word count exceeds threshold', () => {
    expect(runCheck('hello this is a full sentence', 2)).toEqual({
      normalizedText: 'hello this is a full sentence',
      wordCount: 6,
      shouldBlock: false,
    });
  });

  it('should apply consistent word counting logic in both methods', () => {
    const threshold = 2;

    // Compare the normalising path against counting the raw transcript directly.
    for (const transcript of ['', 'hello', 'hello world', 'this is a longer sentence']) {
      const normalizedPath = runCheck(transcript, threshold);
      const directCount = splitWords(transcript, true).length;
      const directShouldBlock = directCount < threshold;

      expect(normalizedPath.wordCount).toBe(directCount);
      expect(normalizedPath.shouldBlock).toBe(directShouldBlock);
    }
  });
});
151+
});

agents/src/voice/room_io/room_io.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,10 @@ export class RoomIO {
369369
return this.transcriptionSynchronizer.textOutput;
370370
}
371371

372+
/** Whether `participantAvailableFuture` has completed, as reported by its `done` flag. */
get isParticipantAvailable(): boolean {
  const { done } = this.participantAvailableFuture;
  return done;
}
375+
372376
/** Switch to a different participant */
373377
setParticipant(participantIdentity: string | null) {
374378
this.logger.debug({ participantIdentity }, 'setting participant');

0 commit comments

Comments
 (0)