diff --git a/src/app/api/tts/voices/route.ts b/src/app/api/tts/voices/route.ts index f548771d..e01c55d9 100644 --- a/src/app/api/tts/voices/route.ts +++ b/src/app/api/tts/voices/route.ts @@ -2,7 +2,7 @@ import { NextRequest, NextResponse } from "next/server"; import { execFile } from "node:child_process"; import { promisify } from "node:util"; import { requireAuth } from "@/lib/require-auth"; -import { OPENAI_TTS_VOICES, type TtsVoiceOption, type TtsProviderId } from "@/lib/tts-voices"; +import { GOOGLE_REALTIME_VOICES, OPENAI_TTS_VOICES, type TtsVoiceOption, type TtsProviderId } from "@/lib/tts-voices"; export const dynamic = "force-dynamic"; @@ -19,7 +19,7 @@ export async function GET(request: NextRequest) { const provider = normalizeProvider(request.nextUrl.searchParams.get("provider")); const query = (request.nextUrl.searchParams.get("q") || "").trim().toLowerCase(); - const providers: TtsProviderId[] = provider === "all" ? ["openai", "elevenlabs", "say", "browser"] : [provider]; + const providers: TtsProviderId[] = provider === "all" ? ["openai", "google", "elevenlabs", "say", "browser"] : [provider]; const settled = await Promise.allSettled(providers.map((p) => listProviderVoices(p))); const voices = settled.flatMap((result) => result.status === "fulfilled" ? result.value : []); const filtered = query @@ -34,7 +34,7 @@ export async function GET(request: NextRequest) { } function normalizeProvider(value: string | null): ProviderFilter { - if (value === "openai" || value === "elevenlabs" || value === "say" || value === "browser") return value; + if (value === "openai" || value === "google" || value === "elevenlabs" || value === "say" || value === "browser") return value; return "all"; } @@ -42,6 +42,8 @@ async function listProviderVoices(provider: TtsProviderId): Promise { }); }); + it("maps Google voice selections to realtime session settings", () => { + expect(resolveRealtimeVoiceSessionSettings({ + enabled: true, + provider: "google", + voiceId: "Kore", + model: "gemini-2.5-flash-native-audio-preview-12-2025", + })).toEqual({ + provider: "google", + voice: "Kore", + model: "gemini-2.5-flash-native-audio-preview-12-2025", + }); + }); + it("does not forward non-realtime TTS voice settings", () => { expect(resolveRealtimeVoiceSessionSettings({ enabled: true, diff --git a/src/lib/realtime-voice-client.ts b/src/lib/realtime-voice-client.ts index 93802ae9..e359cc9e 100644 --- a/src/lib/realtime-voice-client.ts +++ b/src/lib/realtime-voice-client.ts @@ -1,4 +1,5 @@ import { + GOOGLE_REALTIME_VOICE_IDS, OPENAI_REALTIME_VOICE_IDS, normalizeAgentVoiceSettings, type AgentVoiceSettings, @@ -90,10 +91,20 @@ export function resolveRealtimeVoiceSessionSettings( voiceSettings?: AgentVoiceSettings | null, ): Pick { const voice = normalizeAgentVoiceSettings(voiceSettings); - if (voice.enabled === false || voice.provider !== "openai") return {}; + if (voice.enabled === false) return {}; - const voiceId = voice.voiceId?.trim().toLowerCase(); + const rawVoiceId = voice.voiceId?.trim(); + const voiceId = rawVoiceId?.toLowerCase(); const model = voice.model?.trim(); + if (voice.provider === "google") { + return { + provider: "google", + voice: voiceId && GOOGLE_REALTIME_VOICE_IDS.has(voiceId) ? rawVoiceId : undefined, + model: model?.includes("native-audio") || model?.includes("live") ? model : undefined, + }; + } + if (voice.provider !== "openai") return {}; + return { provider: "openai", voice: voiceId && OPENAI_REALTIME_VOICE_IDS.has(voiceId) ? voiceId : undefined, diff --git a/src/lib/tts-voices.ts b/src/lib/tts-voices.ts index a9518737..d4f4ab7f 100644 --- a/src/lib/tts-voices.ts +++ b/src/lib/tts-voices.ts @@ -1,4 +1,4 @@ -export type TtsProviderId = "openai" | "elevenlabs" | "say" | "browser"; +export type TtsProviderId = "openai" | "google" | "elevenlabs" | "say" | "browser"; export interface TtsVoiceOption { id: string; @@ -22,6 +22,7 @@ export interface AgentVoiceSettings { export const TTS_PROVIDER_OPTIONS: Array<{ value: TtsProviderId | "auto"; label: string; description: string }> = [ { value: "auto", label: "Auto", description: "Use the best available device or configured backend voice" }, { value: "openai", label: "OpenAI", description: "Cloud neural voices" }, + { value: "google", label: "Google", description: "Gemini realtime voices" }, { value: "elevenlabs", label: "ElevenLabs", description: "Large voice library when configured" }, { value: "say", label: "macOS say", description: "Local system voices" }, { value: "browser", label: "Browser", description: "Web Speech voices on this device" }, @@ -55,6 +56,15 @@ export const OPENAI_REALTIME_VOICE_IDS = new Set([ "verse", ]); +export const GOOGLE_REALTIME_VOICES: TtsVoiceOption[] = [ + { id: "Kore", name: "Kore", provider: "google", description: "Gemini realtime default" }, + { id: "Puck", name: "Puck", provider: "google", description: "Gemini realtime voice" }, +]; + +export const GOOGLE_REALTIME_VOICE_IDS = new Set( + GOOGLE_REALTIME_VOICES.map((voice) => voice.id.toLowerCase()), +); + export const DEFAULT_AGENT_VOICE_SETTINGS: AgentVoiceSettings = { enabled: true, provider: "auto", @@ -104,5 +114,9 @@ export function shouldUseDeviceTts(voice: AgentVoiceSettings) { } export function isRealtimeVoiceOption(voice: Pick) { - return voice.provider === "openai" && OPENAI_REALTIME_VOICE_IDS.has(voice.id.trim().toLowerCase()); + const voiceId = voice.id.trim().toLowerCase(); + return ( + (voice.provider === "openai" && OPENAI_REALTIME_VOICE_IDS.has(voiceId)) || + (voice.provider === "google" && GOOGLE_REALTIME_VOICE_IDS.has(voiceId)) + ); }