diff --git a/src/lib/realtime-voice-gateway-relay.test.ts b/src/lib/realtime-voice-gateway-relay.test.ts index e23e7d2..a5bcbcc 100644 --- a/src/lib/realtime-voice-gateway-relay.test.ts +++ b/src/lib/realtime-voice-gateway-relay.test.ts @@ -16,11 +16,11 @@ function inputWithLevel(level: number) { } describe("realtime gateway relay barge-in detection", () => { - it("keeps the existing desktop sensitivity", () => { + it("requires sustained desktop speech before barge-in", () => { let speechFrames = 0; for (let i = 0; i < DESKTOP_REALTIME_BARGE_IN_PROFILE.frames; i += 1) { const result = detectRealtimeBargeIn({ - input: inputWithLevel(0.09), + input: inputWithLevel(0.11), activeOutput: true, cancelRequested: false, speechFrames, @@ -53,12 +53,12 @@ describe("realtime gateway relay barge-in detection", () => { let speechFrames = 0; for (let i = 0; i < MOBILE_REALTIME_BARGE_IN_PROFILE.frames - 1; i += 1) { const result = detectRealtimeBargeIn({ - input: inputWithLevel(0.17), + input: inputWithLevel(0.23), activeOutput: true, cancelRequested: false, speechFrames, outputStartedAtMs: 1_000, - nowMs: 2_000, + nowMs: 2_500, profile: MOBILE_REALTIME_BARGE_IN_PROFILE, }); speechFrames = result.speechFrames; @@ -66,12 +66,12 @@ describe("realtime gateway relay barge-in detection", () => { } const result = detectRealtimeBargeIn({ - input: inputWithLevel(0.17), + input: inputWithLevel(0.23), activeOutput: true, cancelRequested: false, speechFrames, outputStartedAtMs: 1_000, - nowMs: 2_000, + nowMs: 2_500, profile: MOBILE_REALTIME_BARGE_IN_PROFILE, }); @@ -79,6 +79,24 @@ describe("realtime gateway relay barge-in detection", () => { expect(result.suppressInput).toBe(false); }); + it("does not interrupt mobile output for short speech bursts", () => { + let speechFrames = 0; + for (let i = 0; i < MOBILE_REALTIME_BARGE_IN_PROFILE.frames - 2; i += 1) { + const result = detectRealtimeBargeIn({ + input: inputWithLevel(0.3), + activeOutput: true, + cancelRequested: false, + speechFrames, + outputStartedAtMs: 1_000, + nowMs: 2_500, + profile: MOBILE_REALTIME_BARGE_IN_PROFILE, + }); + speechFrames = result.speechFrames; + expect(result.triggered).toBe(false); + expect(result.suppressInput).toBe(true); + } + }); + it("suppresses mobile playback echo until barge-in is confirmed", () => { const echo = detectRealtimeBargeIn({ input: inputWithLevel(0.08), diff --git a/src/lib/realtime-voice-gateway-relay.ts b/src/lib/realtime-voice-gateway-relay.ts index dd489d7..b319e44 100644 --- a/src/lib/realtime-voice-gateway-relay.ts +++ b/src/lib/realtime-voice-gateway-relay.ts @@ -13,13 +13,13 @@ import { base64ToBytes, bytesToBase64, floatToPcm16, pcm16ToFloat, rmsLevel } fr export type RealtimeVoiceStatus = "idle" | "listening" | "processing" | "speaking" | "error"; -const BARGE_IN_RMS_THRESHOLD = 0.02; -const BARGE_IN_PEAK_THRESHOLD = 0.08; -const BARGE_IN_FRAMES = 2; -const MOBILE_BARGE_IN_RMS_THRESHOLD = 0.055; -const MOBILE_BARGE_IN_PEAK_THRESHOLD = 0.16; -const MOBILE_BARGE_IN_FRAMES = 4; -const MOBILE_BARGE_IN_GRACE_MS = 750; +const BARGE_IN_RMS_THRESHOLD = 0.03; +const BARGE_IN_PEAK_THRESHOLD = 0.1; +const BARGE_IN_FRAMES = 3; +const MOBILE_BARGE_IN_RMS_THRESHOLD = 0.075; +const MOBILE_BARGE_IN_PEAK_THRESHOLD = 0.22; +const MOBILE_BARGE_IN_FRAMES = 7; +const MOBILE_BARGE_IN_GRACE_MS = 1200; const REALTIME_VOICE_CONTEXT_LIMIT = 8; export interface RealtimeBargeInProfile {