@@ -330,7 +341,14 @@ function ToolbarRightSection({
{planModeEnabled && !isAgentBusy && onImplementPlan && (
+
+ {onVoiceTranscript && (
+
+ )}
);
diff --git a/apps/web/components/task/chat/voice-input-button.tsx b/apps/web/components/task/chat/voice-input-button.tsx
new file mode 100644
index 000000000..978bfebda
--- /dev/null
+++ b/apps/web/components/task/chat/voice-input-button.tsx
@@ -0,0 +1,265 @@
+"use client";
+
+import { useCallback, useEffect, useRef } from "react";
+import { IconLoader2, IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
+
+import { Button } from "@kandev/ui/button";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@kandev/ui/tooltip";
+import { cn } from "@/lib/utils";
+import {
+ useVoiceInput,
+ type VoiceError,
+ type VoiceInputState,
+ type VoiceModelLoadState,
+} from "@/hooks/use-voice-input";
+import { useAppStore } from "@/components/state-provider";
+import { useKeyboardShortcut } from "@/hooks/use-keyboard-shortcut";
+import { useToast } from "@/components/toast-provider";
+import { getShortcut } from "@/lib/keyboard/shortcut-overrides";
+
+/** Props shared by VoiceInputButton and EnabledVoiceInputButton. */
+type VoiceInputButtonProps = {
+ /** Inserts the recognized transcript at the current cursor position. */
+ onTranscript: (text: string) => void;
+ /** Called after a non-empty transcript was inserted, when auto-send is enabled. */
+ onAutoSend?: () => void;
+ /** Disable while the chat input itself is disabled (sending / starting / failed). */
+ disabled?: boolean;
+};
+
+// Tooltip text for each voice-input state; indexed by the current state when
+// the tooltip content is rendered.
+const TOOLTIP_BY_STATE: Record
= {
+ idle: "Voice input",
+ requesting: "Requesting microphone…",
+ recording: "Stop recording",
+ processing: "Transcribing…",
+};
+
+// Accessible label text for each voice-input state.
+// NOTE(review): the markup that consumes this map is not visible in this view —
+// presumably the button's aria-label; confirm.
+const ARIA_BY_STATE: Record = {
+ idle: "Start voice input",
+ requesting: "Requesting microphone permission",
+ recording: "Stop voice input",
+ processing: "Transcribing voice input",
+};
+
+/**
+ * Chooses the icon shown inside the voice button.
+ *
+ * Branch order matters: the busy branch (processing, requesting, or a model
+ * load in progress) wins over the recording branch; anything else falls
+ * through to the default icon.
+ * NOTE(review): the returned elements are missing from this view — presumably
+ * the loader, stop and microphone icons imported at the top; confirm.
+ */
+function ButtonIcon({
+ state,
+ modelLoad,
+}: {
+ state: VoiceInputState;
+ modelLoad: VoiceModelLoadState;
+}) {
+ if (state === "processing" || state === "requesting" || modelLoad.state === "loading") {
+ return ;
+ }
+ if (state === "recording") {
+ return ;
+ }
+ return ;
+}
+
+/**
+ * Shows a toast for a voice error. A "no-speech" error is shown without a
+ * variant; every other error code uses the "error" variant.
+ */
+function toastForError(toast: ReturnType["toast"], err: VoiceError) {
+ if (err.code === "no-speech") {
+ toast({ title: err.message });
+ return;
+ }
+ toast({ title: err.message, variant: "error" });
+}
+
+// ── Activation handlers ──────────────────────────────────────────────────
+
+/**
+ * Builds the pointer handlers for hold-to-talk: pointer down starts a session;
+ * pointer up, pointer leave and pointer cancel each request a stop.
+ * The promises returned by start/stop are intentionally not awaited.
+ */
+function buildHoldHandlers(start: () => Promise, stop: () => Promise) {
+ return {
+ onPointerDown: (e: React.PointerEvent) => {
+ e.preventDefault();
+ void start();
+ },
+ onPointerUp: (e: React.PointerEvent) => {
+ e.preventDefault();
+ void stop();
+ },
+ // Leave and cancel also stop, so a pointer that ends off the element still
+ // results in a stop() call.
+ onPointerLeave: () => void stop(),
+ onPointerCancel: () => void stop(),
+ };
+}
+
+/**
+ * Builds the click handler for toggle mode: starts when idle, stops when
+ * recording, and does nothing in the "requesting" and "processing" states.
+ * The handler closes over the `state` value it was built with.
+ */
+function buildToggleHandler(
+ state: VoiceInputState,
+ start: () => Promise,
+ stop: () => Promise,
+) {
+ return () => {
+ if (state === "idle") void start();
+ else if (state === "recording") void stop();
+ };
+}
+
+// ── Hook composition ─────────────────────────────────────────────────────
+
+/**
+ * Wraps `baseOnTranscript` so that, when auto-send is enabled, `onAutoSend`
+ * is scheduled for the next animation frame after the transcript was handed
+ * over.
+ *
+ * @param baseOnTranscript Receives every transcript, synchronously.
+ * @param onAutoSend Submit callback; skipped when undefined or when auto-send is off.
+ * @param enabled Whether auto-send is turned on.
+ * @returns A memoized transcript handler with the same signature as `baseOnTranscript`.
+ */
+function useAutoSendOnTranscript(
+  baseOnTranscript: (text: string) => void,
+  onAutoSend: (() => void) | undefined,
+  enabled: boolean,
+) {
+  return useCallback(
+    (text: string) => {
+      baseOnTranscript(text);
+      // Auto-send is documented as firing only after a non-empty transcript
+      // was inserted, so never submit for blank text.
+      if (!enabled || !onAutoSend || text.trim() === "") return;
+      // Defer to the next frame to keep the editor update separate from the
+      // submit handler. Wrap the call so requestAnimationFrame's timestamp
+      // argument is not forwarded into `onAutoSend`.
+      requestAnimationFrame(() => onAutoSend());
+    },
+    [baseOnTranscript, onAutoSend, enabled],
+  );
+}
+
+/**
+ * Registers the VOICE_INPUT_TOGGLE keyboard shortcut (honouring the user's
+ * shortcut overrides from the store). The shortcut starts voice input when
+ * idle and stops it when recording.
+ */
+function useVoiceShortcut(
+ enabled: boolean,
+ state: VoiceInputState,
+ start: () => Promise,
+ stop: () => Promise,
+) {
+ const overrides = useAppStore((s) => s.userSettings.keyboardShortcuts);
+ const shortcut = getShortcut("VOICE_INPUT_TOGGLE", overrides);
+ // Mirror `state` into a ref so `handler` keeps a stable identity across state
+ // changes while still reading the latest value when the shortcut fires.
+ const stateRef = useRef(state);
+ useEffect(() => {
+ stateRef.current = state;
+ }, [state]);
+ const handler = useCallback(() => {
+ if (stateRef.current === "idle") void start();
+ else if (stateRef.current === "recording") void stop();
+ }, [start, stop]);
+ useKeyboardShortcut(shortcut, handler, { enabled });
+}
+
+// ── Unsupported fallback ────────────────────────────────────────────────
+
+/**
+ * Explains why voice input cannot be used in the current environment.
+ *
+ * @returns A generic message when there is no `window`, an HTTPS hint when the
+ *   page is not a secure context, and a browser-support hint otherwise.
+ */
+function buildUnsupportedReason(): string {
+  const hasWindow = typeof window !== "undefined";
+  if (!hasWindow) {
+    return "Voice input is unavailable here.";
+  }
+  return window.isSecureContext
+    ? "Voice input isn't supported in this browser. Try Chrome, Edge, or Safari 14.5+."
+    : "Voice input needs HTTPS. Open this site over https:// (or http://localhost) — most mobile browsers block microphone APIs on insecure origins.";
+}
+
+/**
+ * Fallback shown when no voice engine is usable. Its click handler raises an
+ * error toast whose description comes from buildUnsupportedReason().
+ * NOTE(review): the rendered markup is missing from this view, so how
+ * `disabled` and `handleClick` are attached cannot be confirmed here.
+ */
+function UnsupportedVoiceButton({ disabled }: { disabled?: boolean }) {
+ const { toast } = useToast();
+ const handleClick = () => {
+ toast({
+ title: "Voice input unavailable",
+ description: buildUnsupportedReason(),
+ variant: "error",
+ });
+ };
+ return (
+
+
+
+
+
+
+ Voice input unavailable — tap for details
+
+ );
+}
+
+// ── Component ────────────────────────────────────────────────────────────
+
+/**
+ * Public entry point for the voice button. Reads the `voiceMode.enabled`
+ * setting and renders nothing when the feature is switched off.
+ */
+export function VoiceInputButton({ onTranscript, onAutoSend, disabled }: VoiceInputButtonProps) {
+ const enabled = useAppStore((s) => s.userSettings.voiceMode.enabled);
+ // Render nothing when the user has disabled the feature in settings, so the
+ // voice hooks of the enabled path never run. Distinct from `!supported`
+ // (browser limitation), which shows a tappable greyed icon. Done as a
+ // sub-component so this component's own hook count is the same on both paths.
+ if (!enabled) return null;
+ return (
+
+ );
+}
+
+/**
+ * The live voice button. Connects useVoiceInput to error toasts, optional
+ * auto-send, the keyboard shortcut, and either hold or toggle activation
+ * depending on `voiceMode.mode`.
+ */
+function EnabledVoiceInputButton({ onTranscript, onAutoSend, disabled }: VoiceInputButtonProps) {
+ const { toast } = useToast();
+ const voiceMode = useAppStore((s) => s.userSettings.voiceMode);
+ const handleError = useCallback((err: VoiceError) => toastForError(toast, err), [toast]);
+ const wrappedTranscript = useAutoSendOnTranscript(onTranscript, onAutoSend, voiceMode.autoSend);
+
+ const { supported, state, modelLoad, start, stop, cancel } = useVoiceInput({
+ onTranscript: wrappedTranscript,
+ onError: handleError,
+ });
+
+ // If the chat input gets disabled mid-recording, cancel rather than leave
+ // the mic indicator on. Hold-mode pointerup may not fire if focus moves.
+ useEffect(() => {
+ if (disabled && (state === "recording" || state === "requesting")) cancel();
+ }, [disabled, state, cancel]);
+
+ // The shortcut is only active when an engine is available and the input is enabled.
+ useVoiceShortcut(supported && !disabled, state, start, stop);
+
+ // Always render the button — even when unsupported — so users can see it on
+ // mobile and tap to learn why voice input isn't working (usually a missing
+ // secure context, e.g. when reaching the dev server over LAN HTTP). Hiding
+ // the button silently left mobile users with no discoverable feedback.
+ if (!supported) return ;
+
+ const isRecording = state === "recording";
+ const isBusy = state === "requesting" || state === "processing" || modelLoad.state === "loading";
+ const holdMode = voiceMode.mode === "hold";
+
+ // Exactly one activation style is wired: pointer handlers in hold mode,
+ // a click handler otherwise.
+ const pointerHandlers = holdMode ? buildHoldHandlers(start, stop) : {};
+ const onClick = holdMode ? undefined : buildToggleHandler(state, start, stop);
+
+ // Styled to mirror SubmitButton (h-7 w-7 rounded-full primary fill) so the
+ // two prominent input actions read as a pair on the right of the toolbar.
+ // Recording flips to a destructive fill with a pulsing ring so the active
+ // state is unmistakable even on mobile.
+ return (
+
+
+
+
+ {isRecording && (
+
+ )}
+
+
+
+ {modelLoad.state === "loading"
+ ? `Loading model… ${Math.round(modelLoad.progress * 100)}%`
+ : `${TOOLTIP_BY_STATE[state]}${holdMode && state === "idle" ? " (hold)" : ""}`}
+
+
+ );
+}
diff --git a/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts b/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts
index 3722d205b..fdbcbfb42 100644
--- a/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts
+++ b/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts
@@ -95,8 +95,10 @@ test.describe("Toolbar overflow menu", () => {
// Context badge should be hidden when collapsed to avoid clipping
await expect(contextBadge).not.toBeVisible();
- // Submit button should remain visible (always-visible item)
- const submitBtn = toolbar.locator("button.rounded-full");
+ // Submit button should remain visible (always-visible item). Target the
+ // submit testid specifically — the voice input button is also round, so a
+ // bare `button.rounded-full` locator now matches both and fails strict mode.
+ const submitBtn = toolbar.getByTestId("submit-message-button");
await expect(submitBtn).toBeVisible();
// Click expand toggle — items appear inline (scrollable)
diff --git a/apps/web/hooks/use-user-display-settings.ts b/apps/web/hooks/use-user-display-settings.ts
index 250e2bac2..c06dfb5c2 100644
--- a/apps/web/hooks/use-user-display-settings.ts
+++ b/apps/web/hooks/use-user-display-settings.ts
@@ -6,7 +6,10 @@ import { useAppStore } from "@/components/state-provider";
import { useRepositories } from "@/hooks/domains/workspace/use-repositories";
import { mapUserSettingsResponse } from "@/lib/ssr/user-settings";
import { repositoryId, type Repository } from "@/lib/types/http";
-import type { UserSettingsState } from "@/lib/state/slices/settings/types";
+import {
+ DEFAULT_VOICE_MODE_STATE,
+ type UserSettingsState,
+} from "@/lib/state/slices/settings/types";
type DisplaySettings = UserSettingsState;
@@ -36,7 +39,15 @@ function carryForwardTerminalSettings(current: DisplaySettings) {
};
}
-function carryForwardSettings(current: DisplaySettings) {
+/**
+ * Picks the LSP-related settings out of `current`, substituting an empty
+ * list/object for any value that is null or undefined.
+ */
+function carryForwardLspSettings(current: DisplaySettings) {
+  const { lspAutoStartLanguages, lspAutoInstallLanguages, lspServerConfigs } = current;
+  return {
+    lspAutoStartLanguages: lspAutoStartLanguages ?? [],
+    lspAutoInstallLanguages: lspAutoInstallLanguages ?? [],
+    lspServerConfigs: lspServerConfigs ?? {},
+  };
+}
+
+function carryForwardCoreSettings(current: DisplaySettings) {
return {
shellOptions: current.shellOptions ?? [],
defaultEditorId: current.defaultEditorId ?? null,
@@ -44,14 +55,19 @@ function carryForwardSettings(current: DisplaySettings) {
reviewAutoMarkOnScroll: current.reviewAutoMarkOnScroll ?? true,
showReleaseNotification: current.showReleaseNotification ?? true,
releaseNotesLastSeenVersion: current.releaseNotesLastSeenVersion ?? null,
- lspAutoStartLanguages: current.lspAutoStartLanguages ?? [],
- lspAutoInstallLanguages: current.lspAutoInstallLanguages ?? [],
- lspServerConfigs: current.lspServerConfigs ?? {},
savedLayouts: current.savedLayouts ?? [],
sidebarViews: current.sidebarViews ?? [],
defaultUtilityAgentId: current.defaultUtilityAgentId ?? null,
keyboardShortcuts: current.keyboardShortcuts ?? {},
changesPanelLayout: current.changesPanelLayout ?? "flat",
+ voiceMode: current.voiceMode ?? { ...DEFAULT_VOICE_MODE_STATE },
+ };
+}
+
+function carryForwardSettings(current: DisplaySettings) {
+ return {
+ ...carryForwardCoreSettings(current),
+ ...carryForwardLspSettings(current),
...carryForwardTerminalSettings(current),
};
}
diff --git a/apps/web/hooks/use-voice-input.test.ts b/apps/web/hooks/use-voice-input.test.ts
new file mode 100644
index 000000000..3137cceba
--- /dev/null
+++ b/apps/web/hooks/use-voice-input.test.ts
@@ -0,0 +1,199 @@
+import { act, renderHook, waitFor } from "@testing-library/react";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+
+// ── Hoisted mocks (defined before the modules they replace are evaluated) ──
+
+// Mutable holder for the voice-mode preferences served by the mocked store.
+// Tests reassign `voicePrefs.value` to change what the hook under test reads.
+const voicePrefs = vi.hoisted(() => ({
+ value: {
+ engine: "auto" as "auto" | "webSpeech" | "whisperWeb" | "whisperServer",
+ language: "auto",
+ mode: "toggle" as "toggle" | "hold",
+ autoSend: false,
+ whisperWebModel: "base" as "tiny" | "base" | "small",
+ },
+}));
+
+// Replace the app store with a stub that applies the selector to a minimal
+// state containing only `userSettings.voiceMode` (read from voicePrefs at call time).
+vi.mock("@/components/state-provider", () => ({
+ useAppStore: (
+ selector: (state: { userSettings: { voiceMode: typeof voicePrefs.value } }) => unknown,
+ ) => selector({ userSettings: { voiceMode: voicePrefs.value } }),
+}));
+
+// Hoisted spy standing in for the voice API module's transcribeAudio export.
+const transcribeAudio = vi.hoisted(() => vi.fn());
+vi.mock("@/lib/api/domains/voice-api", () => ({ transcribeAudio }));
+
+// ── Mock SpeechRecognition ─────────────────────────────────────────────
+
+// Shape of the fake recognition object: the members the tests set or invoke,
+// plus call counters so tests can assert on start/stop/abort.
+type SpeechHandle = {
+ start: () => void;
+ stop: () => void;
+ abort: () => void;
+ onresult: ((ev: { resultIndex: number; results: unknown }) => void) | null;
+ onerror: ((ev: { error: string }) => void) | null;
+ onend: (() => void) | null;
+ continuous: boolean;
+ interimResults: boolean;
+ maxAlternatives: number;
+ lang: string;
+ startCalls: number;
+ stopCalls: number;
+ abortCalls: number;
+};
+
+let recognitionInstance: SpeechHandle | null = null;
+
+// Factory pattern instead of `class` so we can avoid aliasing `this` in the
+// constructor (the lint rule disallows it) while still satisfying the
+// `new ()` shape that useVoiceInput's `new Ctor()` calls.
+function FakeSpeechRecognition() {
+ // start/stop/abort only bump counters; no events are fired automatically, so
+ // tests must invoke onresult/onerror/onend themselves.
+ const handle: SpeechHandle = {
+ continuous: false,
+ interimResults: false,
+ maxAlternatives: 1,
+ lang: "",
+ onresult: null,
+ onerror: null,
+ onend: null,
+ startCalls: 0,
+ stopCalls: 0,
+ abortCalls: 0,
+ start() {
+ handle.startCalls += 1;
+ },
+ stop() {
+ handle.stopCalls += 1;
+ },
+ abort() {
+ handle.abortCalls += 1;
+ },
+ };
+ // Expose the most recently constructed instance to the tests.
+ recognitionInstance = handle;
+ // Returning an object from a function called with `new` makes that object
+ // the result of the `new` expression.
+ return handle;
+}
+
+// Import after mocks so the module under test sees the mocked store.
+import { useVoiceInput } from "./use-voice-input";
+
+// ── Tests ───────────────────────────────────────────────────────────────
+
+// Reset preferences, the captured recognition instance and the API spy, then
+// install browser-API stubs so capability detection has something to find.
+beforeEach(() => {
+ voicePrefs.value = {
+ engine: "auto",
+ language: "auto",
+ mode: "toggle",
+ autoSend: false,
+ whisperWebModel: "base",
+ };
+ recognitionInstance = null;
+ transcribeAudio.mockReset();
+ (window as unknown as { SpeechRecognition: unknown }).SpeechRecognition =
+ FakeSpeechRecognition as unknown as new () => SpeechHandle;
+ // MediaRecorder/getUserMedia not used in the auto→webSpeech path, but provide
+ // a stub so capability detection sees audioCapture available too.
+ (window as unknown as { MediaRecorder: { isTypeSupported: () => boolean } }).MediaRecorder = {
+ isTypeSupported: () => true,
+ };
+ Object.defineProperty(global.navigator, "mediaDevices", {
+ value: { getUserMedia: vi.fn() },
+ configurable: true,
+ });
+});
+
+// Remove the window-level stubs installed for each test.
+// NOTE(review): the navigator.mediaDevices override from beforeEach and the
+// navigator.language override set by the language test are not restored here.
+afterEach(() => {
+ delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition;
+ delete (window as unknown as { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition;
+ delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder;
+});
+
+// Exercises the hook through the fake SpeechRecognition: engine resolution,
+// the start → result → end flow, and error mapping.
+describe("useVoiceInput — Web Speech engine", () => {
+ it("reports supported and resolves engine = webSpeech under the default auto preference", () => {
+ const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() }));
+ expect(result.current.supported).toBe(true);
+ expect(result.current.engine).toBe("webSpeech");
+ });
+
+ it("transitions idle → recording on start() and emits the final transcript on stop()", async () => {
+ const onTranscript = vi.fn();
+ const { result } = renderHook(() => useVoiceInput({ onTranscript }));
+
+ await act(async () => {
+ await result.current.start();
+ });
+ expect(result.current.state).toBe("recording");
+ expect(recognitionInstance?.startCalls).toBe(1);
+
+ // Simulate the browser delivering one final result and then ending the
+ // session; the fake never fires these events on its own.
+ act(() => {
+ recognitionInstance?.onresult?.({
+ resultIndex: 0,
+ results: {
+ length: 1,
+ 0: { isFinal: true, length: 1, 0: { transcript: "hello world" } },
+ } as unknown,
+ });
+ recognitionInstance?.onend?.();
+ });
+
+ await waitFor(() => {
+ expect(onTranscript).toHaveBeenCalledWith("hello world");
+ expect(result.current.state).toBe("idle");
+ });
+ });
+
+ it("maps a not-allowed permission error to a permission-denied VoiceError", async () => {
+ const onError = vi.fn();
+ const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn(), onError }));
+
+ await act(async () => {
+ await result.current.start();
+ });
+ act(() => {
+ recognitionInstance?.onerror?.({ error: "not-allowed" });
+ });
+
+ expect(onError).toHaveBeenCalledWith({
+ code: "permission-denied",
+ message: "Microphone permission denied.",
+ });
+ expect(result.current.state).toBe("idle");
+ });
+});
+
+// Verifies that the hook reports "unsupported" when no engine is available or
+// when the caller disables it.
+describe("useVoiceInput — capability gating", () => {
+ it("returns supported=false and engine=null when no engine is usable", () => {
+ // Undo the stubs installed by beforeEach so nothing is detectable.
+ delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition;
+ delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder;
+ Object.defineProperty(global.navigator, "mediaDevices", { value: {}, configurable: true });
+
+ const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() }));
+ expect(result.current.supported).toBe(false);
+ expect(result.current.engine).toBeNull();
+ });
+
+ it("disables the hook entirely when enabled=false", () => {
+ const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn(), enabled: false }));
+ expect(result.current.supported).toBe(false);
+ expect(result.current.engine).toBeNull();
+ });
+});
+
+// Verifies which language tag ends up on the recognition instance.
+describe("useVoiceInput — language preference", () => {
+ it("passes the pinned BCP-47 language to SpeechRecognition.lang", async () => {
+ voicePrefs.value = { ...voicePrefs.value, language: "pt-PT" };
+ const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() }));
+
+ await act(async () => {
+ await result.current.start();
+ });
+ expect(recognitionInstance?.lang).toBe("pt-PT");
+ });
+
+ it("falls back to navigator.language when 'auto'", async () => {
+ voicePrefs.value = { ...voicePrefs.value, language: "auto" };
+ // NOTE(review): this override is not undone after the test.
+ Object.defineProperty(global.navigator, "language", { value: "fr-FR", configurable: true });
+ const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() }));
+ await act(async () => {
+ await result.current.start();
+ });
+ expect(recognitionInstance?.lang).toBe("fr-FR");
+ });
+});
diff --git a/apps/web/hooks/use-voice-input.ts b/apps/web/hooks/use-voice-input.ts
new file mode 100644
index 000000000..454df30f8
--- /dev/null
+++ b/apps/web/hooks/use-voice-input.ts
@@ -0,0 +1,493 @@
+"use client";
+
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { ApiError } from "@/lib/api/client";
+import { transcribeAudio } from "@/lib/api/domains/voice-api";
+import { detectVoiceCapabilities, resolveActiveEngine } from "@/lib/voice/capabilities";
+import { WhisperWebClient, type WhisperWebProgress } from "@/lib/voice/whisper-web-client";
+import { useAppStore } from "@/components/state-provider";
+import type { VoiceInputEngine, WhisperWebModelSize } from "@/lib/types/http-voice";
+
+// ── Public types ────────────────────────────────────────────────────────
+
+/** Lifecycle of one voice-input session as exposed by the hook. */
+export type VoiceInputState = "idle" | "requesting" | "recording" | "processing";
+
+/** Machine-readable category attached to every VoiceError. */
+export type VoiceErrorCode =
+ | "permission-denied"
+ | "no-speech"
+ | "not-configured"
+ | "network"
+ | "unsupported"
+ | "model-load"
+ | "unknown";
+
+/** Error reported through `onError` and the hook's `error` field. */
+export type VoiceError = { code: VoiceErrorCode; message: string };
+
+/** Load status of the in-browser Whisper model; `progress` is a 0–1 fraction. */
+export type VoiceModelLoadState = {
+ state: "idle" | "loading" | "ready" | "error";
+ progress: number;
+};
+
+/** Options accepted by useVoiceInput. */
+export type UseVoiceInputOptions = {
+ onTranscript: (text: string) => void;
+ onError?: (error: VoiceError) => void;
+ /** Set false to disable the hook entirely (e.g. for read-only contexts). */
+ enabled?: boolean;
+};
+
+/** Value returned by useVoiceInput. `supported` is true exactly when `engine` is non-null. */
+export type UseVoiceInputResult = {
+ supported: boolean;
+ engine: Exclude | null;
+ state: VoiceInputState;
+ error: VoiceError | null;
+ modelLoad: VoiceModelLoadState;
+ start: () => Promise;
+ stop: () => Promise;
+ cancel: () => void;
+};
+
+// ── Web Speech typings (DOM lib doesn't ship them) ─────────────────────
+
+// Minimal structural types covering only the members this file reads or sets.
+type SpeechAlt = { transcript: string };
+type SpeechResult = { isFinal: boolean; 0: SpeechAlt; length: number };
+type SpeechResultList = { length: number; [index: number]: SpeechResult };
+type SpeechResultEvent = { resultIndex: number; results: SpeechResultList };
+type SpeechErrorEvent = { error: string; message?: string };
+type SpeechRecognitionInstance = {
+ lang: string;
+ continuous: boolean;
+ interimResults: boolean;
+ maxAlternatives: number;
+ start: () => void;
+ stop: () => void;
+ abort: () => void;
+ onresult: ((ev: SpeechResultEvent) => void) | null;
+ onerror: ((ev: SpeechErrorEvent) => void) | null;
+ onend: (() => void) | null;
+};
+
+// Constructor shape used to instantiate whichever recognition class the window exposes.
+type SpeechCtor = new () => SpeechRecognitionInstance;
+
+/**
+ * Instantiates the browser's speech recognizer, preferring the unprefixed
+ * constructor over the webkit-prefixed one.
+ *
+ * @returns A new recognizer, or null when there is no `window` or neither
+ *   constructor is present.
+ */
+function createSpeechRecognition(): SpeechRecognitionInstance | null {
+  if (typeof window === "undefined") {
+    return null;
+  }
+  const host = window as Window & {
+    SpeechRecognition?: SpeechCtor;
+    webkitSpeechRecognition?: SpeechCtor;
+  };
+  const RecognitionCtor = host.SpeechRecognition ?? host.webkitSpeechRecognition;
+  if (!RecognitionCtor) {
+    return null;
+  }
+  return new RecognitionCtor();
+}
+
+// ── Error mappers ───────────────────────────────────────────────────────
+
+/**
+ * Translates a speech-recognition error code into a VoiceError.
+ * Codes without a dedicated case are reported as "unknown" with the raw code
+ * embedded in the message.
+ */
+function mapSpeechError(code: string): VoiceError {
+  switch (code) {
+    case "not-allowed":
+    case "service-not-allowed":
+      return { code: "permission-denied", message: "Microphone permission denied." };
+    case "no-speech":
+      return { code: "no-speech", message: "No speech detected. Try again." };
+    case "network":
+      return { code: "network", message: "Voice recognition lost network connection." };
+    case "audio-capture":
+      return { code: "unknown", message: "No microphone was found." };
+    default:
+      return { code: "unknown", message: `Voice recognition error: ${code}` };
+  }
+}
+
+/**
+ * Translates a failure from starting microphone capture into a VoiceError,
+ * keyed on the thrown object's `name`. Values that are not objects with a
+ * `name` fall back to a generic "unknown" error.
+ */
+function mapMicError(err: unknown): VoiceError {
+  const hasName = Boolean(err) && typeof err === "object" && "name" in (err as object);
+  const errorName = hasName ? (err as { name: string }).name : undefined;
+  switch (errorName) {
+    case "NotAllowedError":
+    case "SecurityError":
+      return { code: "permission-denied", message: "Microphone permission denied." };
+    case "NotFoundError":
+    case "OverconstrainedError":
+      return { code: "unknown", message: "No microphone was found." };
+    default:
+      return { code: "unknown", message: "Failed to start recording." };
+  }
+}
+
+/**
+ * Translates a server-transcription failure into a VoiceError. Only an
+ * ApiError with status 503 is reported as "not-configured"; everything else
+ * is reported as a "network" error.
+ */
+function mapTranscribeError(err: unknown): VoiceError {
+  const notConfigured = err instanceof ApiError && err.status === 503;
+  if (!notConfigured) {
+    return { code: "network", message: "Transcription failed. Please try again." };
+  }
+  return {
+    code: "not-configured",
+    message:
+      "Server-side transcription isn't configured. Pick Web Speech or Whisper Web in Voice Mode settings.",
+  };
+}
+
+/**
+ * Wraps a Whisper Web failure in a VoiceError with code "model-load". The
+ * message is taken from `err` when it is an Error, otherwise a fixed default.
+ */
+function whisperErrorMessage(err: unknown): VoiceError {
+  let message = "Whisper Web failed to transcribe.";
+  if (err instanceof Error) {
+    message = err.message;
+  }
+  return { code: "model-load", message };
+}
+
+/**
+ * Resolves the language tag handed to speech recognition. A pinned preference
+ * is returned unchanged; an empty or "auto" preference falls back to
+ * `navigator.language`, or "en-US" when there is no `navigator`.
+ */
+function resolveLang(preference: string): string {
+  if (!preference || preference === "auto") {
+    if (typeof navigator === "undefined") {
+      return "en-US";
+    }
+    return navigator.language;
+  }
+  return preference;
+}
+
+/**
+ * Converts the stored language preference into the hint passed to Whisper.
+ *
+ * @returns undefined for an empty or "auto" preference; otherwise the
+ *   lower-cased text before the first "-" (or the whole lower-cased value when
+ *   there is no "-" after the first character).
+ */
+function resolveWhisperLang(preference: string): string | undefined {
+  const isAuto = !preference || preference === "auto";
+  if (isAuto) {
+    return undefined;
+  }
+  // The settings UI stores BCP-47 tags ("en-US", "pt-BR") while Whisper's
+  // tokenizer only knows two-letter ISO 639-1 codes ("en", "pt"), so the
+  // region suffix is dropped here rather than letting the pipeline silently
+  // ignore the hint and auto-detect instead.
+  const separatorAt = preference.indexOf("-");
+  const base = separatorAt > 0 ? preference.substring(0, separatorAt) : preference;
+  return base.toLowerCase();
+}
+
+// ── MediaRecorder capture primitive ─────────────────────────────────────
+
+/**
+ * Picks the first recording format, in preference order, that the browser's
+ * MediaRecorder reports as supported.
+ *
+ * @returns The chosen MIME type and matching file extension, or an empty MIME
+ *   type with the "webm" extension when MediaRecorder is unavailable or none
+ *   of the candidates is supported.
+ */
+function pickRecorderMime(): { mime: string; ext: string } {
+  const fallback = { mime: "", ext: "webm" };
+  if (typeof window === "undefined" || typeof window.MediaRecorder === "undefined") {
+    return fallback;
+  }
+  const preferred: Array<{ mime: string; ext: string }> = [
+    { mime: "audio/webm;codecs=opus", ext: "webm" },
+    { mime: "audio/webm", ext: "webm" },
+    { mime: "audio/mp4", ext: "m4a" },
+    { mime: "audio/ogg;codecs=opus", ext: "ogg" },
+    { mime: "audio/wav", ext: "wav" },
+  ];
+  const supported = preferred.find(({ mime }) => window.MediaRecorder.isTypeSupported(mime));
+  return supported ?? fallback;
+}
+
+// Everything needed to finish or abandon one MediaRecorder capture: the
+// microphone stream, its recorder, the chunks collected so far, and the
+// MIME type / file extension chosen when the capture started.
+type CaptureHandle = {
+ stream: MediaStream;
+ recorder: MediaRecorder;
+ chunks: Blob[];
+ mime: string;
+ ext: string;
+};
+
+/**
+ * Requests microphone access and starts a MediaRecorder on the resulting
+ * stream. Non-empty chunks are appended to the returned handle's `chunks`
+ * array as `dataavailable` events arrive.
+ *
+ * @returns The live capture handle (stream, recorder, chunk buffer, and the
+ *   MIME type / extension picked by pickRecorderMime).
+ * @throws Whatever `getUserMedia`, the `MediaRecorder` constructor, or
+ *   `recorder.start()` throws. If the failure happens after the stream was
+ *   acquired, its tracks are stopped first so the microphone is released.
+ */
+async function startCapture(): Promise<CaptureHandle> {
+  const { mime, ext } = pickRecorderMime();
+  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+  try {
+    const recorder = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
+    const chunks: Blob[] = [];
+    recorder.addEventListener("dataavailable", (e) => {
+      if (e.data && e.data.size > 0) chunks.push(e.data);
+    });
+    recorder.start();
+    return { stream, recorder, chunks, mime, ext };
+  } catch (err) {
+    // No handle is returned on this path, so nobody else can release the
+    // stream; stop its tracks here before propagating the error.
+    for (const track of stream.getTracks()) track.stop();
+    throw err;
+  }
+}
+
+/** Stops every track of the handle's stream; does nothing for a null handle. */
+function teardownCapture(handle: CaptureHandle | null) {
+  handle?.stream.getTracks().forEach((track) => track.stop());
+}
+
+/**
+ * Stops the recorder and resolves with the recorded audio once its "stop"
+ * event fires. Resolves with null when the recorder was already inactive or
+ * when no chunks were collected. The stream's tracks are stopped on every path.
+ * NOTE(review): the promise only settles from the "stop" listener; there is no
+ * handling here for a recorder that never emits it.
+ */
+function stopCapture(handle: CaptureHandle): Promise {
+ return new Promise((resolve) => {
+ if (handle.recorder.state === "inactive") {
+ teardownCapture(handle);
+ resolve(null);
+ return;
+ }
+ handle.recorder.addEventListener(
+ "stop",
+ () => {
+ // Prefer the recorder's reported type, then the type requested at start.
+ const type = handle.recorder.mimeType || handle.mime || "audio/webm";
+ const blob = handle.chunks.length > 0 ? new Blob(handle.chunks, { type }) : null;
+ teardownCapture(handle);
+ resolve(blob);
+ },
+ { once: true },
+ );
+ handle.recorder.stop();
+ });
+}
+
+// ── Driver refs ─────────────────────────────────────────────────────────
+
+// The session currently in flight, if any: either a Web Speech recognizer or
+// a MediaRecorder capture tagged with the engine that will transcribe it.
+type ActiveDriverRef =
+ | { kind: "webSpeech"; recognition: SpeechRecognitionInstance }
+ | { kind: "capture"; handle: CaptureHandle; engine: "whisperWeb" | "whisperServer" }
+ | null;
+
+// Ref-like boxes, so the helpers below can take refs without depending on React's types.
+type DriverRefBox = { current: ActiveDriverRef };
+type WhisperRefBox = { current: WhisperWebClient | null };
+
+/**
+ * Abandons the active session without producing a transcript and clears the
+ * ref. Does nothing when no session is active.
+ */
+function abortDriver(ref: DriverRefBox) {
+  const active = ref.current;
+  if (!active) return;
+  if (active.kind !== "webSpeech") {
+    teardownCapture(active.handle);
+  } else {
+    const { recognition } = active;
+    // Some browsers still fire onerror/onend after abort(). Detach the
+    // callbacks first so those trailing events cannot touch hook state that
+    // the caller (cancel()) has just reset.
+    recognition.onresult = null;
+    recognition.onerror = null;
+    recognition.onend = null;
+    recognition.abort();
+  }
+  ref.current = null;
+}
+
+// ── Web Speech driver ───────────────────────────────────────────────────
+
+// Dependencies injected into runWebSpeech by the hook.
+type WebSpeechHandlers = {
+ setState: (s: VoiceInputState) => void;
+ driverRef: DriverRefBox;
+ emitError: (e: VoiceError) => void;
+ onTranscriptRef: { current: (text: string) => void };
+ lang: string;
+};
+
+/**
+ * Starts a Web Speech session. Final results are buffered while the session
+ * runs and delivered as one space-joined transcript when it ends.
+ * Reports "unsupported" if no recognizer can be created and "unknown" if
+ * `recognition.start()` throws.
+ */
+function runWebSpeech(h: WebSpeechHandlers): void {
+ const recognition = createSpeechRecognition();
+ if (!recognition) {
+ h.emitError({ code: "unsupported", message: "Voice recognition is not supported." });
+ return;
+ }
+ const transcripts: string[] = [];
+ recognition.continuous = true;
+ recognition.interimResults = false;
+ recognition.maxAlternatives = 1;
+ recognition.lang = h.lang;
+ recognition.onresult = (ev) => {
+ // Only walk the results added since the previous event, keeping final ones.
+ for (let i = ev.resultIndex; i < ev.results.length; i++) {
+ const r = ev.results[i];
+ if (r.isFinal && r[0]?.transcript) transcripts.push(r[0].transcript.trim());
+ }
+ };
+ recognition.onerror = (ev) => h.emitError(mapSpeechError(ev.error));
+ recognition.onend = () => {
+ // Clear the driver and go idle first, then hand over whatever was
+ // recognised; an empty result is not delivered.
+ h.driverRef.current = null;
+ h.setState("idle");
+ const joined = transcripts.join(" ").trim();
+ if (joined) h.onTranscriptRef.current(joined);
+ };
+ try {
+ recognition.start();
+ // Only publish the driver and the "recording" state once start() has succeeded.
+ h.driverRef.current = { kind: "webSpeech", recognition };
+ h.setState("recording");
+ } catch {
+ h.emitError({ code: "unknown", message: "Failed to start voice recognition." });
+ }
+}
+
+// ── Capture engines (whisperWeb + whisperServer) ───────────────────────
+
+// Dependencies injected into beginCapture by the hook.
+type CaptureHandlers = {
+ setState: (s: VoiceInputState) => void;
+ emitError: (e: VoiceError) => void;
+ driverRef: DriverRefBox;
+};
+
+/**
+ * Starts a MediaRecorder capture for one of the Whisper engines: moves to
+ * "requesting" while the capture is being set up, then to "recording" once it
+ * is running. Failures are reported through `emitError` via mapMicError.
+ * NOTE(review): if cancel() runs while startCapture() is still pending,
+ * driverRef is still empty, so abortDriver has nothing to stop; when the await
+ * resolves this function still installs the driver and sets "recording".
+ * Confirm whether a cancellation check is needed after the await.
+ */
+async function beginCapture(
+ which: "whisperWeb" | "whisperServer",
+ h: CaptureHandlers,
+): Promise {
+ h.setState("requesting");
+ try {
+ const handle = await startCapture();
+ h.driverRef.current = { kind: "capture", handle, engine: which };
+ h.setState("recording");
+ } catch (err) {
+ h.emitError(mapMicError(err));
+ }
+}
+
+// Dependencies injected into finishCapture (and passed on to the Whisper Web
+// helpers) by the hook.
+type FinishCaptureHandlers = {
+ driverRef: DriverRefBox;
+ whisperRef: WhisperRefBox;
+ setState: (s: VoiceInputState) => void;
+ setModelLoad: (next: VoiceModelLoadState) => void;
+ emitError: (e: VoiceError) => void;
+ onTranscriptRef: { current: (text: string) => void };
+ whisperModel: WhisperWebModelSize;
+ language: string;
+};
+
+/**
+ * Ends the active capture, transcribes the audio with the engine recorded on
+ * the driver, and delivers a non-empty transcript through `onTranscriptRef`.
+ * Returns without doing anything when the active driver is not a capture.
+ * Transcription failures are reported through `emitError`; the stopCapture
+ * await sits outside the try block, so a rejection there is not caught here.
+ */
+async function finishCapture(h: FinishCaptureHandlers): Promise {
+ const driver = h.driverRef.current;
+ if (!driver || driver.kind !== "capture") return;
+ // Claim the driver synchronously *before* the first await. In hold mode,
+ // pointerup + pointerleave both fire in the same task and both call stop();
+ // without this early null, the second invocation would also enter
+ // finishCapture, race the first, and could clobber a brand-new recording's
+ // driverRef if the user re-triggered between them.
+ h.driverRef.current = null;
+ h.setState("processing");
+ const blob = await stopCapture(driver.handle);
+ // Nothing was recorded: go back to idle without reporting an error.
+ if (!blob) {
+ h.setState("idle");
+ return;
+ }
+ try {
+ const text =
+ driver.engine === "whisperServer"
+ ? await transcribeViaServer(blob, driver.handle.ext)
+ : await transcribeViaWhisperWeb(blob, h);
+ if (text) h.onTranscriptRef.current(text);
+ h.setState("idle");
+ } catch (err) {
+ // emitError also resets the state to idle, so no explicit setState is needed here.
+ if (driver.engine === "whisperServer") h.emitError(mapTranscribeError(err));
+ else h.emitError(whisperErrorMessage(err));
+ }
+}
+
+/**
+ * Sends the recording to the transcription API under the file name
+ * `recording.<ext>` and returns the trimmed text.
+ */
+async function transcribeViaServer(blob: Blob, ext: string): Promise {
+ const result = await transcribeAudio(blob, `recording.${ext}`);
+ return result.text.trim();
+}
+
+/**
+ * Transcribes the recording with the in-browser Whisper client (created and
+ * initialised on demand) and returns the trimmed text. The language hint is
+ * derived from `h.language` via resolveWhisperLang.
+ */
+async function transcribeViaWhisperWeb(blob: Blob, h: FinishCaptureHandlers): Promise {
+ const client = await ensureWhisperClient(h);
+ const text = await client.transcribe(blob, resolveWhisperLang(h.language));
+ return text.trim();
+}
+
+/**
+ * Returns the cached Whisper Web client, creating it on first use, after
+ * awaiting `init` for the requested model. `init` is awaited on every call,
+ * including when the client already exists. Updates the model-load state to
+ * "ready" on success and "error" on failure, re-throwing the failure.
+ */
+async function ensureWhisperClient(h: FinishCaptureHandlers): Promise {
+ if (!h.whisperRef.current) {
+ h.whisperRef.current = new WhisperWebClient({
+ onProgress: (p: WhisperWebProgress) =>
+ // transformers.js emits progress on a 0–100 scale, but the rest of the
+ // pipeline (and the button's `* 100` display) treats `modelLoad.progress`
+ // as a 0–1 fraction (matching the `ready: 1` convention below). Normalise
+ // here so the button doesn't render "5000%" mid-download.
+ h.setModelLoad({ state: "loading", progress: p.progress / 100 }),
+ });
+ // Show the loading state immediately, before the first progress event.
+ h.setModelLoad({ state: "loading", progress: 0 });
+ }
+ try {
+ await h.whisperRef.current.init(h.whisperModel);
+ h.setModelLoad({ state: "ready", progress: 1 });
+ } catch (err) {
+ h.setModelLoad({ state: "error", progress: 0 });
+ throw err;
+ }
+ return h.whisperRef.current;
+}
+
+// ── Hook helpers ────────────────────────────────────────────────────────
+
+/** Reads the voice-mode slice of the user's settings from the app store. */
+function useVoiceModePrefs() {
+  const voiceMode = useAppStore((store) => store.userSettings.voiceMode);
+  return voiceMode;
+}
+
+/**
+ * Keeps the caller's latest `onTranscript` / `onError` callbacks in refs, so
+ * long-lived handlers can call the current callback without being recreated.
+ */
+function useCallbackRefs(opts: UseVoiceInputOptions) {
+ const onTranscriptRef = useRef(opts.onTranscript);
+ const onErrorRef = useRef(opts.onError);
+ // No dependency array on purpose: the refs are refreshed after every render.
+ useEffect(() => {
+ onTranscriptRef.current = opts.onTranscript;
+ onErrorRef.current = opts.onError;
+ });
+ return { onTranscriptRef, onErrorRef };
+}
+
+// Dispose the cached whisper client whenever the configured model size
+// changes, and call `reset` so the model-load state starts over. This hook
+// only tears down; a new client is created lazily by ensureWhisperClient the
+// next time one is needed. The first render is skipped because
+// previousModelRef starts out equal to modelSize.
+function useDisposeWhisperOnModelChange(
+ whisperRef: WhisperRefBox,
+ modelSize: string,
+ reset: () => void,
+) {
+ const previousModelRef = useRef(modelSize);
+ useEffect(() => {
+ if (previousModelRef.current === modelSize) return;
+ previousModelRef.current = modelSize;
+ whisperRef.current?.dispose();
+ whisperRef.current = null;
+ reset();
+ }, [modelSize, whisperRef, reset]);
+}
+
+/**
+ * Registers an effect cleanup that aborts any active session and disposes the
+ * whisper client.
+ */
+function useUnmountCleanup(driverRef: DriverRefBox, whisperRef: WhisperRefBox) {
+ useEffect(() => {
+ return () => {
+ abortDriver(driverRef);
+ whisperRef.current?.dispose();
+ whisperRef.current = null;
+ };
+ }, [driverRef, whisperRef]);
+}
+
+// ── Hook ────────────────────────────────────────────────────────────────
+
+/**
+ * Voice input hook. Resolves which engine to use from the user's preference
+ * and the detected capabilities, and exposes `start` / `stop` / `cancel`
+ * together with the current state, last error and model-load progress.
+ *
+ * - `start` begins a session when idle (Web Speech directly, or a
+ *   MediaRecorder capture for the Whisper engines) and reports "unsupported"
+ *   when no engine is available.
+ * - `stop` ends the session and lets the transcript be delivered.
+ * - `cancel` abandons the session without a transcript and clears the error.
+ */
+export function useVoiceInput(opts: UseVoiceInputOptions): UseVoiceInputResult {
+ // Capabilities are detected once per mount.
+ const caps = useMemo(() => detectVoiceCapabilities(), []);
+ const prefs = useVoiceModePrefs();
+ // Only an explicit `enabled: false` disables the hook.
+ const enabled = opts.enabled !== false;
+ const engine = useMemo(
+ () => (enabled ? resolveActiveEngine(prefs.engine, caps, true) : null),
+ [enabled, prefs.engine, caps],
+ );
+ const supported = engine !== null;
+
+ const [state, setState] = useState("idle");
+ const [error, setError] = useState(null);
+ const [modelLoad, setModelLoad] = useState({
+ state: "idle",
+ progress: 0,
+ });
+
+ const driverRef = useRef(null);
+ const whisperRef = useRef(null);
+ const { onTranscriptRef, onErrorRef } = useCallbackRefs(opts);
+
+ // Every reported error also returns the hook to the idle state.
+ const emitError = useCallback(
+ (e: VoiceError) => {
+ setError(e);
+ setState("idle");
+ onErrorRef.current?.(e);
+ },
+ [onErrorRef],
+ );
+
+ const resetModelLoad = useCallback(() => setModelLoad({ state: "idle", progress: 0 }), []);
+
+ useUnmountCleanup(driverRef, whisperRef);
+ useDisposeWhisperOnModelChange(whisperRef, prefs.whisperWebModel, resetModelLoad);
+
+ const start = useCallback(async () => {
+ if (!supported || !engine) {
+ emitError({ code: "unsupported", message: "Voice input is not supported in this browser." });
+ return;
+ }
+ // Ignore start requests while a session is already in progress.
+ if (state !== "idle") return;
+ setError(null);
+ if (engine === "webSpeech") {
+ runWebSpeech({
+ setState,
+ driverRef,
+ emitError,
+ onTranscriptRef,
+ lang: resolveLang(prefs.language),
+ });
+ return;
+ }
+ await beginCapture(engine, { setState, emitError, driverRef });
+ }, [supported, engine, state, emitError, prefs.language, onTranscriptRef]);
+
+ const stop = useCallback(async () => {
+ const driver = driverRef.current;
+ if (!driver) return;
+ if (driver.kind === "webSpeech") {
+ // The transcript and the state reset are handled by the recognizer's onend callback.
+ driver.recognition.stop();
+ return;
+ }
+ await finishCapture({
+ driverRef,
+ whisperRef,
+ setState,
+ setModelLoad,
+ emitError,
+ onTranscriptRef,
+ whisperModel: prefs.whisperWebModel,
+ language: prefs.language,
+ });
+ }, [emitError, prefs.whisperWebModel, prefs.language, onTranscriptRef]);
+
+ const cancel = useCallback(() => {
+ abortDriver(driverRef);
+ setState("idle");
+ setError(null);
+ }, []);
+
+ return { supported, engine, state, error, modelLoad, start, stop, cancel };
+}
diff --git a/apps/web/lib/api/domains/settings-api.ts b/apps/web/lib/api/domains/settings-api.ts
index 343e30efa..ec9b229be 100644
--- a/apps/web/lib/api/domains/settings-api.ts
+++ b/apps/web/lib/api/domains/settings-api.ts
@@ -21,6 +21,7 @@ import type {
UserSettingsResponse,
DynamicModelsResponse,
} from "@/lib/types/http";
+import type { VoiceModeSettings } from "@/lib/types/http-voice";
// User settings
export async function fetchUserSettings(options?: ApiRequestOptions) {
@@ -52,6 +53,7 @@ export async function updateUserSettings(
terminal_font_family?: string;
terminal_font_size?: number;
changes_panel_layout?: "flat" | "tree";
+ voice_mode?: VoiceModeSettings;
},
options?: ApiRequestOptions,
) {
diff --git a/apps/web/lib/api/domains/voice-api.test.ts b/apps/web/lib/api/domains/voice-api.test.ts
new file mode 100644
index 000000000..d3618cae8
--- /dev/null
+++ b/apps/web/lib/api/domains/voice-api.test.ts
@@ -0,0 +1,63 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { ApiError } from "../client";
+import { transcribeAudio } from "./voice-api";
+
+const originalFetch = global.fetch;
+
+describe("transcribeAudio", () => {
+  // Put the real fetch back after every test so a stub never leaks into other suites.
+  afterEach(() => {
+    global.fetch = originalFetch;
+  });
+
+  beforeEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it("posts multipart/form-data with the audio under the 'audio' field", async () => {
+    // Holder object (rather than bare `let`s) so TypeScript does not narrow the
+    // captured values to their initial state before the stub has run.
+    const captured: { url: string; init?: RequestInit } = { url: "" };
+    global.fetch = vi.fn(async (url: RequestInfo | URL, init?: RequestInit) => {
+      captured.url = String(url);
+      captured.init = init;
+      return new Response(JSON.stringify({ text: "hi" }), {
+        status: 200,
+        headers: { "Content-Type": "application/json" },
+      });
+    }) as unknown as typeof fetch;
+
+    const blob = new Blob([new Uint8Array([1, 2, 3])], { type: "audio/webm" });
+    const result = await transcribeAudio(blob, "clip.webm", {
+      baseUrl: "http://example.test",
+    });
+
+    expect(result.text).toBe("hi");
+    expect(captured.url).toBe("http://example.test/api/v1/transcribe");
+    expect(captured.init?.method).toBe("POST");
+
+    // The body must be real FormData carrying the clip as a file entry named
+    // "audio" — this is what the test title promises.
+    const body = captured.init?.body;
+    expect(body).toBeInstanceOf(FormData);
+    const audio = (body as FormData).get("audio");
+    expect(audio).not.toBeNull();
+    expect(typeof audio).not.toBe("string");
+    expect((audio as File).name).toBe("clip.webm");
+  });
+
+  it("throws ApiError(503) when the server reports not-configured", async () => {
+    global.fetch = vi.fn(
+      async () =>
+        new Response(JSON.stringify({ error: "voice transcription is not configured" }), {
+          status: 503,
+        }),
+    ) as unknown as typeof fetch;
+
+    const blob = new Blob([new Uint8Array([1])], { type: "audio/webm" });
+    await expect(transcribeAudio(blob, "x.webm", { baseUrl: "http://x" })).rejects.toMatchObject({
+      status: 503,
+    });
+  });
+
+  it("surfaces non-2xx errors as ApiError instances", async () => {
+    global.fetch = vi.fn(
+      async () => new Response("bad", { status: 502, statusText: "Bad Gateway" }),
+    ) as unknown as typeof fetch;
+
+    const blob = new Blob([new Uint8Array([1])], { type: "audio/webm" });
+    await expect(transcribeAudio(blob, "x.webm", { baseUrl: "http://x" })).rejects.toBeInstanceOf(
+      ApiError,
+    );
+  });
+});
diff --git a/apps/web/lib/api/domains/voice-api.ts b/apps/web/lib/api/domains/voice-api.ts
new file mode 100644
index 000000000..d3af1a571
--- /dev/null
+++ b/apps/web/lib/api/domains/voice-api.ts
@@ -0,0 +1,51 @@
+import { ApiError, type ApiRequestOptions } from "../client";
+import { getBackendConfig } from "@/lib/config";
+
+/** JSON body that `transcribeAudio` returns on a successful (2xx) response. */
+export type TranscribeResponse = {
+ /** The transcribed text. */
+ text: string;
+};
+
+/**
+ * POST audio to the backend Whisper fallback. Returns the transcribed text.
+ *
+ * Throws ApiError on non-2xx. Two statuses are meaningful to the caller:
+ *   - 503: server has no API key configured — the hook should treat the
+ *     Whisper fallback as unavailable and surface a clean message.
+ *   - any other non-2xx: transient error — show a generic toast.
+ *
+ * @param blob - Recorded audio to upload.
+ * @param filename - File name attached to the multipart `audio` field.
+ * @param options - Optional base URL override and extra `fetch` init. The
+ *   caller's `method`, `body` and any `Content-Type` header are ignored.
+ * @returns The parsed JSON response body.
+ * @throws ApiError when the response status is not 2xx. The message is the
+ *   server's `error` string when the body is JSON with one, otherwise a
+ *   generic "Transcription failed: <status> <statusText>".
+ */
+export async function transcribeAudio(
+  blob: Blob,
+  filename: string,
+  options?: ApiRequestOptions,
+): Promise<TranscribeResponse> {
+  const baseUrl = options?.baseUrl ?? getBackendConfig().apiBaseUrl;
+  const formData = new FormData();
+  formData.append("audio", blob, filename);
+
+  // Do NOT set Content-Type: the runtime sets multipart/form-data with the
+  // correct boundary automatically when given a FormData body. A caller-provided
+  // Content-Type would replace that header and lose the boundary, so strip it.
+  const headers = new Headers(options?.init?.headers);
+  headers.delete("Content-Type");
+
+  // Spread caller init *first* so headers/method/body always win — otherwise a
+  // caller passing `init: { method: "GET" }` (or a stale body) would silently
+  // break the upload.
+  const response = await fetch(`${baseUrl}/api/v1/transcribe`, {
+    ...options?.init,
+    headers,
+    method: "POST",
+    body: formData,
+  });
+
+  if (response.ok) {
+    return (await response.json()) as TranscribeResponse;
+  }
+
+  // Error bodies are JSON on a best-effort basis; a non-JSON body leaves `body` null.
+  let body: unknown = null;
+  try {
+    body = await response.json();
+  } catch {
+    // body remains null
+  }
+
+  let message = `Transcription failed: ${response.status} ${response.statusText}`;
+  if (body && typeof body === "object" && "error" in body) {
+    const errVal = (body as { error?: unknown }).error;
+    if (typeof errVal === "string") message = errVal;
+  }
+  throw new ApiError(message, response.status, body);
+}
diff --git a/apps/web/lib/keyboard/constants.ts b/apps/web/lib/keyboard/constants.ts
index 31271c0b9..e05ab2373 100644
--- a/apps/web/lib/keyboard/constants.ts
+++ b/apps/web/lib/keyboard/constants.ts
@@ -153,4 +153,10 @@ export const SHORTCUTS = {
key: KEYS.F,
modifiers: { ctrlOrCmd: true },
},
+ // Cmd+Shift+M starts/stops voice input on the chat composer. The default
+ // is configurable per-user via the Voice Mode settings page.
+ VOICE_INPUT_TOGGLE: {
+ key: KEYS.M,
+ modifiers: { ctrlOrCmd: true, shift: true },
+ },
} as const;
diff --git a/apps/web/lib/keyboard/shortcut-overrides.test.ts b/apps/web/lib/keyboard/shortcut-overrides.test.ts
index 6453bc902..43c59c3df 100644
--- a/apps/web/lib/keyboard/shortcut-overrides.test.ts
+++ b/apps/web/lib/keyboard/shortcut-overrides.test.ts
@@ -20,7 +20,8 @@ describe("CONFIGURABLE_SHORTCUTS", () => {
expect(ids).toContain("FOCUS_INPUT");
expect(ids).toContain("TOGGLE_PLAN_MODE");
expect(ids).toContain("TASK_SWITCHER");
- expect(ids).toHaveLength(10);
+ expect(ids).toContain("VOICE_INPUT_TOGGLE");
+ expect(ids).toHaveLength(11);
});
it("each entry has a label and default matching SHORTCUTS", () => {
diff --git a/apps/web/lib/keyboard/shortcut-overrides.ts b/apps/web/lib/keyboard/shortcut-overrides.ts
index 8ac1b7a37..a31d61e15 100644
--- a/apps/web/lib/keyboard/shortcut-overrides.ts
+++ b/apps/web/lib/keyboard/shortcut-overrides.ts
@@ -10,7 +10,8 @@ export type ConfigurableShortcutId =
| "NEW_TASK"
| "FOCUS_INPUT"
| "TOGGLE_PLAN_MODE"
- | "TASK_SWITCHER";
+ | "TASK_SWITCHER"
+ | "VOICE_INPUT_TOGGLE";
export type StoredShortcutOverrides = Record<
string,
@@ -31,6 +32,7 @@ export const CONFIGURABLE_SHORTCUTS: Record<
FOCUS_INPUT: { label: "Focus Chat Input", default: SHORTCUTS.FOCUS_INPUT },
TOGGLE_PLAN_MODE: { label: "Toggle Plan Mode", default: SHORTCUTS.TOGGLE_PLAN_MODE },
TASK_SWITCHER: { label: "Recent Task Switcher", default: SHORTCUTS.TASK_SWITCHER },
+ VOICE_INPUT_TOGGLE: { label: "Voice Input", default: SHORTCUTS.VOICE_INPUT_TOGGLE },
};
export function getShortcut(
diff --git a/apps/web/lib/ssr/user-settings.test.ts b/apps/web/lib/ssr/user-settings.test.ts
index 04f425b0d..38b681b6e 100644
--- a/apps/web/lib/ssr/user-settings.test.ts
+++ b/apps/web/lib/ssr/user-settings.test.ts
@@ -1,5 +1,10 @@
import { describe, it, expect } from "vitest";
-import { buildCoreFields, mapUserSettingsResponse, parseChangesPanelLayout } from "./user-settings";
+import {
+ buildCoreFields,
+ mapUserSettingsResponse,
+ parseChangesPanelLayout,
+ parseVoiceMode,
+} from "./user-settings";
describe("buildCoreFields", () => {
it("maps terminal_font_family to terminalFontFamily", () => {
@@ -103,3 +108,78 @@ describe("parseChangesPanelLayout", () => {
expect(parseChangesPanelLayout("")).toBe("flat");
});
});
+
+describe("parseVoiceMode", () => {
+  it("maps every field from the snake_case wire payload", () => {
+    expect(
+      parseVoiceMode({
+        enabled: false,
+        engine: "whisperWeb",
+        language: "pt-PT",
+        mode: "hold",
+        auto_send: true,
+        whisper_web_model: "small",
+      }),
+    ).toEqual({
+      enabled: false,
+      engine: "whisperWeb",
+      language: "pt-PT",
+      mode: "hold",
+      autoSend: true,
+      whisperWebModel: "small",
+    });
+  });
+
+  it("returns the defaults when the payload is undefined", () => {
+    expect(parseVoiceMode(undefined)).toEqual({
+      enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    });
+  });
+
+  it("defaults enabled to true when the wire payload omits the field (old rows)", () => {
+    // The payload deliberately omits `enabled`, so it has to be cast to the
+    // parser's parameter type to get past the compiler.
+    const result = parseVoiceMode({
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      auto_send: false,
+      whisper_web_model: "base",
+    } as unknown as Parameters<typeof parseVoiceMode>[0]);
+    expect(result.enabled).toBe(true);
+  });
+
+  it("fills in defaults for missing string fields and coerces auto_send to false", () => {
+    // Empty strings are not valid members of the wire unions; the casts let the
+    // test feed them in to exercise the fallback path.
+    const result = parseVoiceMode({
+      engine: "" as unknown as "auto",
+      language: "",
+      mode: "" as unknown as "toggle",
+      whisper_web_model: "" as unknown as "base",
+    } as unknown as Parameters<typeof parseVoiceMode>[0]);
+    expect(result).toEqual({
+      enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    });
+  });
+});
+
+describe("mapUserSettingsResponse voice mode", () => {
+  it("defaults the whole voiceMode object when response is null", () => {
+    // With no response at all, every voice-mode field must come from the defaults.
+    const expectedDefaults = {
+      enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    };
+
+    const { voiceMode } = mapUserSettingsResponse(null);
+
+    expect(voiceMode).toEqual(expectedDefaults);
+  });
+});
diff --git a/apps/web/lib/ssr/user-settings.ts b/apps/web/lib/ssr/user-settings.ts
index 74a3d127f..b2ed73508 100644
--- a/apps/web/lib/ssr/user-settings.ts
+++ b/apps/web/lib/ssr/user-settings.ts
@@ -1,6 +1,8 @@
import { fromApiSidebarView } from "@/lib/state/slices/ui/sidebar-view-wire";
import type { SidebarView } from "@/lib/state/slices/ui/sidebar-view-types";
+import { DEFAULT_VOICE_MODE_STATE, type VoiceModeState } from "@/lib/state/slices/settings/types";
import type { SavedLayout, UserSettingsResponse } from "@/lib/types/http";
+import type { VoiceModeSettings } from "@/lib/types/http-voice";
export type UserSettingsData = NonNullable;
@@ -12,6 +14,25 @@ export function parseChangesPanelLayout(value: string | undefined): "flat" | "tr
return value === "tree" ? "tree" : "flat";
}
+/**
+ * Converts the backend's snake_case VoiceMode payload into the camelCase
+ * state shape used by the store and UI.
+ *
+ * A missing payload yields a fresh copy of the defaults. In a partial payload,
+ * falsy string fields (including "") fall back to their default so an old user
+ * row never surfaces as an empty string the radio groups can't render.
+ * Non-boolean `enabled` becomes true — voice mode is opt-out, not opt-in — and
+ * non-boolean `auto_send` becomes false.
+ */
+export function parseVoiceMode(value: VoiceModeSettings | undefined): VoiceModeState {
+  const defaults = DEFAULT_VOICE_MODE_STATE;
+  if (!value) return { ...defaults };
+
+  const {
+    enabled,
+    engine,
+    language,
+    mode,
+    auto_send: autoSend,
+    whisper_web_model: whisperWebModel,
+  } = value;
+
+  return {
+    enabled: typeof enabled === "boolean" ? enabled : true,
+    engine: engine || defaults.engine,
+    language: language || defaults.language,
+    mode: mode || defaults.mode,
+    autoSend: typeof autoSend === "boolean" ? autoSend : false,
+    whisperWebModel: whisperWebModel || defaults.whisperWebModel,
+  };
+}
+
function buildTerminalFields(s: UserSettingsData) {
return {
terminalLinkBehavior: parseTerminalLinkBehavior(s.terminal_link_behavior),
@@ -21,6 +42,10 @@ function buildTerminalFields(s: UserSettingsData) {
};
}
+/** Builds the `voiceMode` store field from the settings' `voice_mode` payload. */
+function buildVoiceModeFields(s: UserSettingsData) {
+  const voiceMode = parseVoiceMode(s.voice_mode);
+  return { voiceMode };
+}
+
function buildIdentityFields(s: UserSettingsData) {
return {
workspaceId: s.workspace_id || null,
@@ -51,6 +76,7 @@ export function buildCoreFields(s: UserSettingsData) {
savedLayouts: s.saved_layouts ?? [],
sidebarViews: (s.sidebar_views ?? []).map(fromApiSidebarView) as SidebarView[],
...buildTerminalFields(s),
+ ...buildVoiceModeFields(s),
};
}
@@ -91,6 +117,7 @@ export function mapUserSettingsResponse(response: UserSettingsResponse | null) {
terminalFontFamily: null,
terminalFontSize: null,
changesPanelLayout: "flat" as const,
+ voiceMode: { ...DEFAULT_VOICE_MODE_STATE },
...buildLspFields(undefined),
loaded: false,
};
diff --git a/apps/web/lib/state/slices/settings/settings-slice.ts b/apps/web/lib/state/slices/settings/settings-slice.ts
index 26ce9c67b..d9dca4acb 100644
--- a/apps/web/lib/state/slices/settings/settings-slice.ts
+++ b/apps/web/lib/state/slices/settings/settings-slice.ts
@@ -1,5 +1,5 @@
import type { StateCreator } from "zustand";
-import type { SettingsSlice, SettingsSliceState } from "./types";
+import { DEFAULT_VOICE_MODE_STATE, type SettingsSlice, type SettingsSliceState } from "./types";
export const defaultSettingsState: SettingsSliceState = {
executors: { items: [] },
@@ -44,6 +44,7 @@ export const defaultSettingsState: SettingsSliceState = {
terminalFontFamily: null,
terminalFontSize: null,
changesPanelLayout: "flat",
+ voiceMode: { ...DEFAULT_VOICE_MODE_STATE },
loaded: false,
},
};
diff --git a/apps/web/lib/state/slices/settings/types.ts b/apps/web/lib/state/slices/settings/types.ts
index 73f094761..ca7740a93 100644
--- a/apps/web/lib/state/slices/settings/types.ts
+++ b/apps/web/lib/state/slices/settings/types.ts
@@ -11,6 +11,11 @@ import type {
SavedLayout,
ToolStatus,
} from "@/lib/types/http";
+import type {
+ VoiceInputActivationMode,
+ VoiceInputEngine,
+ WhisperWebModelSize,
+} from "@/lib/types/http-voice";
import type { SidebarView } from "@/lib/state/slices/ui/sidebar-view-types";
import type { SecretListItem } from "@/lib/types/http-secrets";
import type { SpritesStatus, SpritesInstance } from "@/lib/types/http-sprites";
@@ -156,9 +161,29 @@ export type UserSettingsState = {
terminalFontFamily: string | null;
terminalFontSize: number | null;
changesPanelLayout: "flat" | "tree";
+ voiceMode: VoiceModeState;
loaded: boolean;
};
+/** Voice Mode preferences as held in the settings store (camelCase form). */
+export type VoiceModeState = {
+ /** Whether voice input is turned on for this user. */
+ enabled: boolean;
+ /** Preferred engine, or "auto". */
+ engine: VoiceInputEngine;
+ /** Recognition language; "auto" is the default value. */
+ language: string;
+ /** Activation mode: "toggle" or "hold". */
+ mode: VoiceInputActivationMode;
+ /** Auto-send preference. */
+ autoSend: boolean;
+ /** Model size selected for the Whisper Web engine. */
+ whisperWebModel: WhisperWebModelSize;
+};
+
+/**
+ * Default values used by the slice init and by SSR hydration fallback.
+ * Consumers in this change spread it (`{ ...DEFAULT_VOICE_MODE_STATE }`)
+ * rather than storing this object itself.
+ */
+export const DEFAULT_VOICE_MODE_STATE: VoiceModeState = {
+ enabled: true,
+ engine: "auto",
+ language: "auto",
+ mode: "toggle",
+ autoSend: false,
+ whisperWebModel: "base",
+};
+
export type SettingsSliceState = {
executors: ExecutorsState;
settingsAgents: SettingsAgentsState;
diff --git a/apps/web/lib/types/backend.ts b/apps/web/lib/types/backend.ts
index c97912e3c..448dba0f9 100644
--- a/apps/web/lib/types/backend.ts
+++ b/apps/web/lib/types/backend.ts
@@ -383,6 +383,7 @@ export type UserSettingsUpdatedPayload = {
keyboard_shortcuts?: Record }>;
terminal_link_behavior?: string;
changes_panel_layout?: "flat" | "tree";
+ voice_mode?: import("@/lib/types/http-voice").VoiceModeSettings;
updated_at?: string;
};
diff --git a/apps/web/lib/types/http-voice.ts b/apps/web/lib/types/http-voice.ts
new file mode 100644
index 000000000..c43351524
--- /dev/null
+++ b/apps/web/lib/types/http-voice.ts
@@ -0,0 +1,17 @@
+/**
+ * Wire types for the Voice Mode user settings. Kept in their own module so
+ * http.ts stays under the 600-line file limit.
+ */
+
+/** Engine preference: a concrete engine, or "auto". */
+export type VoiceInputEngine = "auto" | "webSpeech" | "whisperWeb" | "whisperServer";
+/** How recording is activated. */
+export type VoiceInputActivationMode = "toggle" | "hold";
+/** Whisper model sizes offered for the in-browser (Whisper Web) engine. */
+export type WhisperWebModelSize = "tiny" | "base" | "small";
+
+/** Voice Mode settings as sent over the wire (snake_case keys). */
+export type VoiceModeSettings = {
+ enabled: boolean;
+ engine: VoiceInputEngine;
+ language: string;
+ mode: VoiceInputActivationMode;
+ auto_send: boolean;
+ whisper_web_model: WhisperWebModelSize;
+};
diff --git a/apps/web/lib/types/http.ts b/apps/web/lib/types/http.ts
index fae94bf0c..0953be4c3 100644
--- a/apps/web/lib/types/http.ts
+++ b/apps/web/lib/types/http.ts
@@ -406,6 +406,8 @@ export type SidebarViewApi = {
collapsed_groups: string[];
};
+import type { VoiceModeSettings } from "./http-voice";
+
export type UserSettings = {
user_id: string;
workspace_id: WorkspaceId;
@@ -432,6 +434,7 @@ export type UserSettings = {
terminal_font_family?: string;
terminal_font_size?: number;
changes_panel_layout?: "flat" | "tree";
+ voice_mode?: VoiceModeSettings;
updated_at: string;
};
diff --git a/apps/web/lib/voice/capabilities.test.ts b/apps/web/lib/voice/capabilities.test.ts
new file mode 100644
index 000000000..d8b8d7191
--- /dev/null
+++ b/apps/web/lib/voice/capabilities.test.ts
@@ -0,0 +1,97 @@
+import { describe, it, expect, afterEach, vi } from "vitest";
+import { detectVoiceCapabilities, resolveActiveEngine } from "./capabilities";
+
+describe("detectVoiceCapabilities", () => {
+  // The browser globals these tests add to / remove from `window`.
+  type PatchableWindow = {
+    SpeechRecognition?: unknown;
+    webkitSpeechRecognition?: unknown;
+    MediaRecorder?: unknown;
+  };
+  const patchableWindow = () => window as unknown as PatchableWindow;
+
+  // Undo every global a test may have installed so cases stay independent.
+  afterEach(() => {
+    vi.unstubAllGlobals();
+    const win = patchableWindow();
+    delete win.SpeechRecognition;
+    delete win.webkitSpeechRecognition;
+    delete win.MediaRecorder;
+  });
+
+  it("reports webSpeech true when window.SpeechRecognition exists", () => {
+    patchableWindow().SpeechRecognition = () => {};
+    const caps = detectVoiceCapabilities();
+    expect(caps.webSpeech).toBe(true);
+  });
+
+  it("reports webSpeech true on the prefixed webkit variant too", () => {
+    patchableWindow().webkitSpeechRecognition = () => {};
+    const caps = detectVoiceCapabilities();
+    expect(caps.webSpeech).toBe(true);
+  });
+
+  it("reports audioCapture true when MediaRecorder + getUserMedia are present", () => {
+    patchableWindow().MediaRecorder = {
+      isTypeSupported: () => true,
+    };
+    vi.stubGlobal("navigator", { mediaDevices: { getUserMedia: () => Promise.resolve({}) } });
+    const caps = detectVoiceCapabilities();
+    expect(caps.audioCapture).toBe(true);
+  });
+
+  it("reports everything false when no APIs are available", () => {
+    vi.stubGlobal("navigator", {});
+    const caps = detectVoiceCapabilities();
+    expect(caps).toEqual({
+      webSpeech: false,
+      whisperWeb: false,
+      audioCapture: false,
+    });
+  });
+});
+
+describe("resolveActiveEngine", () => {
+  // Named capability fixtures, one per scenario exercised below.
+  const allAvailable = { webSpeech: true, whisperWeb: true, audioCapture: true };
+  const withoutWebSpeech = { webSpeech: false, whisperWeb: true, audioCapture: true };
+  const withoutWhisperWeb = { webSpeech: true, whisperWeb: false, audioCapture: true };
+  const captureOnly = { webSpeech: false, whisperWeb: false, audioCapture: true };
+  const nothingAvailable = { webSpeech: false, whisperWeb: false, audioCapture: false };
+
+  it("auto picks webSpeech first when available", () => {
+    const engine = resolveActiveEngine("auto", allAvailable, true);
+    expect(engine).toBe("webSpeech");
+  });
+
+  it("auto falls back to whisperWeb when webSpeech is missing", () => {
+    const engine = resolveActiveEngine("auto", withoutWebSpeech, true);
+    expect(engine).toBe("whisperWeb");
+  });
+
+  it("auto falls back to whisperServer when no in-browser engine is available", () => {
+    const engine = resolveActiveEngine("auto", captureOnly, true);
+    expect(engine).toBe("whisperServer");
+  });
+
+  it("returns null when nothing is usable", () => {
+    const engine = resolveActiveEngine("auto", nothingAvailable, true);
+    expect(engine).toBeNull();
+  });
+
+  it("honors a pinned engine when usable", () => {
+    const engine = resolveActiveEngine("whisperWeb", allAvailable, true);
+    expect(engine).toBe("whisperWeb");
+  });
+
+  it("falls back along the auto order when the pinned engine is missing", () => {
+    const engine = resolveActiveEngine("whisperWeb", withoutWhisperWeb, true);
+    expect(engine).toBe("webSpeech");
+  });
+
+  it("treats whisperServer as unusable when serverFallbackEnabled is false", () => {
+    const engine = resolveActiveEngine("whisperServer", captureOnly, false);
+    expect(engine).toBeNull();
+  });
+});
diff --git a/apps/web/lib/voice/capabilities.ts b/apps/web/lib/voice/capabilities.ts
new file mode 100644
index 000000000..6fd36f161
--- /dev/null
+++ b/apps/web/lib/voice/capabilities.ts
@@ -0,0 +1,75 @@
+"use client";
+
+import type { VoiceInputEngine } from "@/lib/types/http-voice";
+
+/**
+ * Capability report for the voice-mode engines available in the current
+ * browser. Shared between `useVoiceInput` (which picks the active engine)
+ * and the Voice Mode settings page (which decides which options to render).
+ */
+export type VoiceCapabilities = {
+ /** True if `window.SpeechRecognition` or `window.webkitSpeechRecognition` exists. */
+ webSpeech: boolean;
+ /** True if `audioCapture` is true and `Worker` is defined. */
+ whisperWeb: boolean;
+ /** True if the browser supports MediaRecorder + getUserMedia, the floor
+ * for any audio-capture engine (whisperWeb + whisperServer). */
+ audioCapture: boolean;
+};
+
+/**
+ * Reports which voice engines the current browser can run.
+ *
+ * Safe during SSR: when `window` is undefined every capability is false
+ * rather than the lookup throwing on a missing global.
+ */
+export function detectVoiceCapabilities(): VoiceCapabilities {
+  if (typeof window === "undefined") {
+    return { webSpeech: false, whisperWeb: false, audioCapture: false };
+  }
+
+  // Web Speech is exposed either unprefixed or under the webkit prefix.
+  const speechHost = window as Window & {
+    SpeechRecognition?: unknown;
+    webkitSpeechRecognition?: unknown;
+  };
+  const hasSpeechApi = Boolean(speechHost.SpeechRecognition || speechHost.webkitSpeechRecognition);
+
+  // Audio capture needs both a microphone stream and a recorder.
+  const hasGetUserMedia =
+    typeof navigator !== "undefined" && typeof navigator.mediaDevices?.getUserMedia === "function";
+  const hasMediaRecorder = typeof window.MediaRecorder !== "undefined";
+  const canCapture = hasGetUserMedia && hasMediaRecorder;
+
+  // whisper-web piggybacks on transformers.js which only needs a Worker plus
+  // either WebGPU or WebAssembly. Every modern browser has both, so the
+  // gating constraint is having MediaRecorder for capture.
+  const hasWorker = typeof Worker !== "undefined";
+
+  return {
+    webSpeech: hasSpeechApi,
+    whisperWeb: canCapture && hasWorker,
+    audioCapture: canCapture,
+  };
+}
+
+/**
+ * Resolves the active voice-input engine given a user preference and the
+ * detected capabilities. Returns null when nothing usable is available.
+ *
+ * Auto-fallback order: Web Speech (cheapest, native) → Whisper Web (private,
+ * heavier) → Whisper Server (always works but requires a configured server).
+ * If the user pinned a specific engine that isn't available, we degrade
+ * gracefully along the same order.
+ *
+ * @param preference - The user's engine preference, or "auto".
+ * @param caps - Capabilities detected for the current browser.
+ * @param serverFallbackEnabled - Whether the Whisper Server engine may be
+ *   used; when false, "whisperServer" is never selected.
+ * @returns The concrete engine to use, or null if none is usable.
+ */
+export function resolveActiveEngine(
+  preference: VoiceInputEngine,
+  caps: VoiceCapabilities,
+  serverFallbackEnabled: boolean,
+): Exclude<VoiceInputEngine, "auto"> | null {
+  // "auto" is a preference, never a runnable engine.
+  type ConcreteEngine = Exclude<VoiceInputEngine, "auto">;
+
+  const order: ConcreteEngine[] = ["webSpeech", "whisperWeb", "whisperServer"];
+
+  const isUsable = (e: ConcreteEngine): boolean => {
+    if (e === "webSpeech") return caps.webSpeech;
+    if (e === "whisperWeb") return caps.whisperWeb;
+    // whisperServer only needs audio capture, plus the server being allowed.
+    return caps.audioCapture && serverFallbackEnabled;
+  };
+
+  // A usable pinned engine always wins.
+  if (preference !== "auto" && isUsable(preference)) return preference;
+
+  // "auto", or a pinned engine that isn't usable — take the first available
+  // one in the auto order so the button still works instead of silently no-op.
+  return order.find(isUsable) ?? null;
+}
diff --git a/apps/web/lib/voice/whisper-web-client.ts b/apps/web/lib/voice/whisper-web-client.ts
new file mode 100644
index 000000000..e9d1cc620
--- /dev/null
+++ b/apps/web/lib/voice/whisper-web-client.ts
@@ -0,0 +1,199 @@
+"use client";
+
+import { whisperModelConfig } from "./whisper-web-models";
+import type { WhisperWebModelSize } from "@/lib/types/http-voice";
+
+/**
+ * Sample rate Whisper expects. We resample the captured audio to this rate
+ * (mono Float32Array) before sending to the worker — Whisper's own decoder
+ * would do this too, but doing it here keeps the worker focused on inference.
+ */
+const WHISPER_SAMPLE_RATE = 16000;
+
+/** Progress update forwarded from the worker's "progress" messages. */
+export type WhisperWebProgress = {
+ stage: string;
+ progress: number;
+};
+
+/** Optional callbacks a `WhisperWebClient` consumer can supply. */
+export type WhisperWebHandlers = {
+ /** Invoked for every "progress" message the worker posts. */
+ onProgress?: (p: WhisperWebProgress) => void;
+};
+
+/** Messages the worker posts back to the client. */
+type WorkerMessage =
+ | { type: "progress"; stage: string; progress: number }
+ | { type: "ready" }
+ | { type: "result"; text: string }
+ | { type: "error"; message: string };
+
+/**
+ * The single in-flight request: the client allows only one at a time and
+ * settles it from the next non-progress worker message.
+ */
+type Pending = {
+ kind: "init" | "transcribe";
+ // Resolved with `undefined` for "ready" and with the text for "result".
+ resolve: (value: string | undefined) => void;
+ reject: (err: Error) => void;
+};
+
+/**
+ * Client wrapper around the whisper-web worker. Hides the postMessage
+ * protocol behind a clean promise-based API and handles the audio decode +
+ * resample step so callers only see "Blob in, transcript out".
+ *
+ * Only one request (init or transcribe) may be in flight at a time; a second
+ * one is rejected with "another request is in flight".
+ */
+export class WhisperWebClient {
+  private worker: Worker | null = null;
+  private pending: Pending | null = null;
+  private ready = false;
+  private loadingModelId: string | null = null;
+
+  /** @param handlers - Optional callbacks, e.g. `onProgress` for load progress. */
+  constructor(private handlers: WhisperWebHandlers = {}) {}
+
+  /**
+   * Lazy-creates the worker on first use. Returns a promise that resolves
+   * when the requested model is loaded and ready to transcribe. A no-op when
+   * that model is already loaded.
+   *
+   * @throws Error if another request is in flight, the worker reports an
+   *   error, the worker crashes, or the client is disposed meanwhile.
+   */
+  async init(size: WhisperWebModelSize): Promise<void> {
+    const config = whisperModelConfig(size);
+    if (this.ready && this.loadingModelId === config.modelId) return;
+    // Refuse *before* touching any state: otherwise an init() issued while a
+    // transcription is running would flip `ready` to false and overwrite the
+    // model id even though the request itself is then rejected.
+    if (this.pending) {
+      throw new Error("WhisperWebClient: another request is in flight");
+    }
+    this.ensureWorker();
+    this.loadingModelId = config.modelId;
+    this.ready = false;
+    await this.send({ kind: "init", payload: { type: "init", model: config.modelId } });
+    this.ready = true;
+  }
+
+  /**
+   * Transcribe a recorded blob. The blob may be in any container the browser
+   * can decode (audio/webm, audio/wav, audio/mp4, …) — we resample everything
+   * to 16 kHz mono Float32 before handing to the worker.
+   *
+   * @param blob - Recorded audio.
+   * @param language - Optional language passed through to the worker.
+   * @returns The transcript, or "" if the worker returned no text.
+   * @throws Error if `init` has not completed, or the request fails.
+   */
+  async transcribe(blob: Blob, language?: string): Promise<string> {
+    if (!this.ready || !this.worker) {
+      throw new Error("WhisperWebClient: not initialized");
+    }
+    const audio = await blobToWhisperFloat32(blob);
+    const text = await this.send({
+      kind: "transcribe",
+      payload: { type: "transcribe", audio, language },
+      // Transfer the sample buffer instead of copying it into the worker.
+      transfer: [audio.buffer],
+    });
+    return text ?? "";
+  }
+
+  /** Tear down the worker and release the loaded model. */
+  dispose(): void {
+    if (this.worker) {
+      try {
+        this.worker.postMessage({ type: "dispose" });
+      } catch {
+        // ignore
+      }
+      this.worker.terminate();
+      this.worker = null;
+    }
+    this.ready = false;
+    this.loadingModelId = null;
+    if (this.pending) {
+      this.pending.reject(new Error("WhisperWebClient disposed"));
+      this.pending = null;
+    }
+  }
+
+  /** Creates the worker and wires its listeners, unless one already exists. */
+  private ensureWorker() {
+    if (this.worker) return;
+    // The `new Worker(new URL(..., import.meta.url))` form is Next.js / webpack's
+    // recommended pattern — webpack handles the bundling and asset path.
+    const ownWorker = new Worker(new URL("../../workers/whisper-web.worker.ts", import.meta.url), {
+      type: "module",
+    });
+    this.worker = ownWorker;
+
+    // Both listeners close over `ownWorker`. A late event from a worker we
+    // already disposed or replaced can still arrive after its replacement
+    // exists; the identity checks below keep such stale events from touching
+    // the new worker or the request pending on it.
+    ownWorker.addEventListener("message", (e: MessageEvent<WorkerMessage>) => {
+      if (this.worker !== ownWorker) return;
+      this.handleMessage(e.data);
+    });
+    ownWorker.addEventListener("error", (e) => {
+      ownWorker.terminate();
+      if (this.worker !== ownWorker) return;
+      this.worker = null;
+      this.ready = false;
+      this.loadingModelId = null;
+      // Any pending request was sent to this worker (send() always targets the
+      // current one), so it can only fail now.
+      const pending = this.pending;
+      this.pending = null;
+      pending?.reject(new Error(e.message || "Whisper worker crashed"));
+    });
+  }
+
+  /**
+   * Posts one request to the worker and returns a promise settled by the
+   * worker's next non-progress message.
+   */
+  private send(args: {
+    kind: "init" | "transcribe";
+    payload: object;
+    transfer?: Transferable[];
+  }): Promise<string | undefined> {
+    const worker = this.worker;
+    if (!worker) throw new Error("WhisperWebClient: worker not initialized");
+    if (this.pending) {
+      return Promise.reject(new Error("WhisperWebClient: another request is in flight"));
+    }
+    return new Promise<string | undefined>((resolve, reject) => {
+      const request: Pending = { kind: args.kind, resolve, reject };
+      this.pending = request;
+      try {
+        worker.postMessage(args.payload, args.transfer ?? []);
+      } catch (err) {
+        // postMessage can throw (e.g. an uncloneable payload). Clear the slot
+        // so the client is not stuck "in flight" forever.
+        if (this.pending === request) this.pending = null;
+        reject(err instanceof Error ? err : new Error(String(err)));
+      }
+    });
+  }
+
+  /** Routes a worker message: progress to the handler, the rest to `pending`. */
+  private handleMessage(msg: WorkerMessage) {
+    if (msg.type === "progress") {
+      this.handlers.onProgress?.({ stage: msg.stage, progress: msg.progress });
+      return;
+    }
+    const pending = this.pending;
+    if (!pending) return;
+    this.pending = null;
+    switch (msg.type) {
+      case "error":
+        pending.reject(new Error(msg.message));
+        return;
+      case "ready":
+        pending.resolve(undefined);
+        return;
+      case "result":
+        pending.resolve(msg.text);
+        return;
+      default:
+        // Unknown message type: settle the request rather than leaving the
+        // caller awaiting a promise nobody holds any more.
+        pending.reject(new Error("WhisperWebClient: unexpected worker message"));
+    }
+  }
+}
+
+/**
+ * Decode an arbitrary audio Blob and return a Float32Array sampled at 16 kHz
+ * mono — the format Whisper expects.
+ *
+ * @param blob - Audio in any container the browser can decode.
+ * @returns The mono samples at 16 kHz.
+ * @throws Error if no AudioContext implementation exists, or the data cannot
+ *   be decoded.
+ */
+export async function blobToWhisperFloat32(blob: Blob): Promise<Float32Array> {
+  const arrayBuffer = await blob.arrayBuffer();
+  // Decode using an AudioContext at the source rate, then bounce through an
+  // OfflineAudioContext for the resample. AudioContext.decodeAudioData
+  // tolerates webm/opus, mp4/aac, wav, ogg — anything the browser can play.
+  const AudioCtor =
+    window.AudioContext ??
+    (window as unknown as { webkitAudioContext?: typeof AudioContext }).webkitAudioContext;
+  if (!AudioCtor) throw new Error("AudioContext is not available in this browser");
+  const decodeCtx = new AudioCtor();
+  let decoded: AudioBuffer;
+  try {
+    decoded = await decodeCtx.decodeAudioData(arrayBuffer);
+  } finally {
+    // Closing is best-effort cleanup: a failure here must neither mask a
+    // decode error nor throw away a successfully decoded buffer.
+    try {
+      await decodeCtx.close();
+    } catch {
+      // ignore
+    }
+  }
+  return resampleToMono16k(decoded);
+}
+
+/**
+ * Renders `buf` through a one-channel OfflineAudioContext running at
+ * WHISPER_SAMPLE_RATE and returns a copy of the rendered channel data.
+ */
+async function resampleToMono16k(buf: AudioBuffer): Promise<Float32Array> {
+  // OfflineAudioContext rejects a length of 0, so a zero-duration buffer is
+  // rendered as a single sample instead of throwing.
+  const length = Math.max(1, Math.ceil(buf.duration * WHISPER_SAMPLE_RATE));
+  const offline = new OfflineAudioContext(1, length, WHISPER_SAMPLE_RATE);
+  const source = offline.createBufferSource();
+  source.buffer = buf;
+  source.connect(offline.destination);
+  source.start(0);
+  const rendered = await offline.startRendering();
+  // slice() copies the samples out of the rendered AudioBuffer, so the caller
+  // gets an array it owns.
+  return rendered.getChannelData(0).slice();
+}
diff --git a/apps/web/lib/voice/whisper-web-models.ts b/apps/web/lib/voice/whisper-web-models.ts
new file mode 100644
index 000000000..eaffe6698
--- /dev/null
+++ b/apps/web/lib/voice/whisper-web-models.ts
@@ -0,0 +1,42 @@
+import type { WhisperWebModelSize } from "@/lib/types/http-voice";
+
+/** Static description of one selectable Whisper Web model. */
+export type WhisperModelConfig = {
+ /** The size key this entry is registered under. */
+ size: WhisperWebModelSize;
+ /** Hugging Face model id. Use the `onnx-community/*` mirrors — `Xenova/*`
+ * defaults to 4-bit MatMulNBits weights that crash on WASM (see note below). */
+ modelId: string;
+ /** Rough on-disk size after download, shown in the settings UI. */
+ approxBytes: number;
+ /** Human-readable label. */
+ label: string;
+};
+
+// The `onnx-community/whisper-*` mirrors are the maintained transformers.js
+// exports. The older `Xenova/whisper-*` mirrors default to 4-bit (`MatMulNBits`)
+// weights that only run on WebGPU — on WASM they fail with
+// `Missing required scale: ... weight_merged_0_scale`. The onnx-community
+// mirrors include the q8 variant we pin to in the worker.
+/** Model configuration for every selectable Whisper Web size. */
+export const WHISPER_WEB_MODELS: Record<WhisperWebModelSize, WhisperModelConfig> = {
+  tiny: {
+    size: "tiny",
+    modelId: "onnx-community/whisper-tiny",
+    approxBytes: 40 * 1024 * 1024,
+    label: "Whisper Tiny",
+  },
+  base: {
+    size: "base",
+    modelId: "onnx-community/whisper-base",
+    approxBytes: 75 * 1024 * 1024,
+    label: "Whisper Base",
+  },
+  small: {
+    size: "small",
+    modelId: "onnx-community/whisper-small",
+    approxBytes: 240 * 1024 * 1024,
+    label: "Whisper Small",
+  },
+};
+
+/**
+ * Looks up the model configuration for `size`, falling back to the "base"
+ * entry when the lookup yields nothing (e.g. an unrecognised value at runtime).
+ */
+export function whisperModelConfig(size: WhisperWebModelSize): WhisperModelConfig {
+  const match = WHISPER_WEB_MODELS[size];
+  return match ?? WHISPER_WEB_MODELS.base;
+}
diff --git a/apps/web/lib/ws/handlers/users.ts b/apps/web/lib/ws/handlers/users.ts
index 1ddb7a71c..0b33698d3 100644
--- a/apps/web/lib/ws/handlers/users.ts
+++ b/apps/web/lib/ws/handlers/users.ts
@@ -1,6 +1,7 @@
import type { StoreApi } from "zustand";
import type { AppState } from "@/lib/state/store";
import type { WsHandlers } from "@/lib/ws/handlers/types";
+import { parseVoiceMode } from "@/lib/ssr/user-settings";
export function registerUsersHandlers(store: StoreApi): WsHandlers {
return {
@@ -31,6 +32,7 @@ export function registerUsersHandlers(store: StoreApi): WsHandlers {
? "browser_panel"
: "new_tab",
changesPanelLayout: message.payload.changes_panel_layout === "tree" ? "tree" : "flat",
+ voiceMode: parseVoiceMode(message.payload.voice_mode),
loaded: true,
},
}));
diff --git a/apps/web/package.json b/apps/web/package.json
index 7de93f075..369517e61 100644
--- a/apps/web/package.json
+++ b/apps/web/package.json
@@ -38,6 +38,7 @@
"@dnd-kit/core": "^6.3.1",
"@dnd-kit/sortable": "^10.0.0",
"@dnd-kit/utilities": "^3.2.2",
+ "@huggingface/transformers": "^4.2.0",
"@kandev/theme": "workspace:*",
"@kandev/types": "workspace:*",
"@kandev/ui": "workspace:*",
diff --git a/apps/web/workers/whisper-web.worker.ts b/apps/web/workers/whisper-web.worker.ts
new file mode 100644
index 000000000..68fa33e4b
--- /dev/null
+++ b/apps/web/workers/whisper-web.worker.ts
@@ -0,0 +1,138 @@
+/// <reference lib="webworker" />
+
+/**
+ * Web Worker that runs OpenAI Whisper entirely in the browser via
+ * @huggingface/transformers (the maintained transformers.js library that
+ * xenova/whisper-web is built on).
+ *
+ * Lives in its own worker because model loading + inference both block the
+ * main thread for several seconds — would freeze the chat input otherwise.
+ *
+ * Wire protocol (postMessage):
+ * in: { type: "init", model: "onnx-community/whisper-base" }
+ * in: { type: "transcribe", audio: Float32Array, language?: string }
+ * in: { type: "dispose" }
+ * out: { type: "progress", stage: string, progress: number }
+ * out: { type: "ready" }
+ * out: { type: "result", text: string }
+ * out: { type: "error", message: string }
+ */
+
+import { pipeline, env, type AutomaticSpeechRecognitionPipeline } from "@huggingface/transformers";
+
+// Disable transformers.js's local-models lookup — we only load from the HF
+// CDN so the worker doesn't try to fetch files from our own origin.
+// Both flags are set explicitly here rather than left at the library defaults.
+env.allowLocalModels = false;
+env.allowRemoteModels = true;
+
+// Inbound messages (page -> worker), discriminated on `type`.
+// "init": load the pipeline for the model id given in `model`.
+type InitMessage = { type: "init"; model: string };
+// "transcribe": run recognition over `audio`. A missing or "auto" `language`
+// is passed to the pipeline as undefined.
+// NOTE(review): the expected sample rate / channel layout of `audio` is not
+// visible in this file — confirm against the recording code.
+type TranscribeMessage = { type: "transcribe"; audio: Float32Array; language?: string };
+// "dispose": release the loaded pipeline, if any.
+type DisposeMessage = { type: "dispose" };
+type InMessage = InitMessage | TranscribeMessage | DisposeMessage;
+
+// Outbound messages (worker -> page), discriminated on `type`.
+// "progress" forwards the library's load-progress callback; `stage` is the
+// callback's status, or "download" when it has none.
+// NOTE(review): `progress` is forwarded unchanged — presumably a percentage;
+// confirm against the transformers.js progress callback.
+type OutMessage =
+ | { type: "progress"; stage: string; progress: number }
+ | { type: "ready" }
+ | { type: "result"; text: string }
+ | { type: "error"; message: string };
+
+// Typed view of the worker global scope, used for postMessage and for
+// registering the "message" listener.
+const ctx = self as unknown as DedicatedWorkerGlobalScope;
+
+// Module-level state: the currently loaded pipeline (null until an "init"
+// succeeds, and again after disposal) and the model id it was created for.
+let asrPipeline: AutomaticSpeechRecognitionPipeline | null = null;
+let activeModelId: string | null = null;
+
+/** Sends one typed protocol message out of the worker via `ctx.postMessage`. */
+function post(message: OutMessage): void {
+  const outbound: OutMessage = message;
+  ctx.postMessage(outbound);
+}
+
+// Subset of the fields read from the library's progress callback payload.
+// All fields are optional; only events carrying a numeric `progress` are
+// forwarded to the page.
+// NOTE(review): this alias shares its name with the global `ProgressEvent`
+// type and shadows it inside this module — consider a more specific name.
+type ProgressEvent = {
+ status?: string;
+ file?: string;
+ progress?: number;
+};
+
+/**
+ * Loads the ASR pipeline for `msg.model` and replies with "ready", or with
+ * "error" if disposing the previous pipeline or creating the new one throws.
+ *
+ * - Same model already loaded: replies "ready" immediately without reloading.
+ * - Different model loaded: the old pipeline is disposed before the new load.
+ *
+ * Progress events from the library that carry a numeric `progress` are
+ * forwarded as "progress" messages while the pipeline is being created.
+ */
+async function handleInit(msg: InitMessage): Promise<void> {
+  if (asrPipeline && activeModelId === msg.model) {
+    post({ type: "ready" });
+    return;
+  }
+  try {
+    if (asrPipeline) {
+      // Clear the references before awaiting: if dispose() rejects, or a
+      // transcribe request arrives mid-switch, nothing is left pointing at a
+      // half-disposed pipeline and activeModelId never names a model that is
+      // gone. Keeping this inside the try means a dispose failure is reported
+      // to the page instead of becoming an unhandled rejection.
+      const stale = asrPipeline;
+      asrPipeline = null;
+      activeModelId = null;
+      await stale.dispose();
+    }
+    // dtype choice rationale: the `_quantized` / `q8` and `q4` decoder weights
+    // for whisper-base both contain `MatMulNBits` ops that only execute on
+    // WebGPU. On browsers without WebGPU (most Firefox, older Chrome) onnxruntime
+    // throws `Missing required scale: ... weight_merged_0_scale`. fp16 has no
+    // quantized ops at all so it works on both WASM and WebGPU; it's ~half the
+    // size of fp32 with no perceptible accuracy loss for ASR.
+    // That rationale covers the decoder; the encoder is kept at fp32.
+    const created = await pipeline("automatic-speech-recognition", msg.model, {
+      dtype: {
+        encoder_model: "fp32",
+        decoder_model_merged: "fp16",
+      },
+      progress_callback: (e: ProgressEvent) => {
+        // Only events that carry a numeric progress value are forwarded.
+        if (typeof e?.progress === "number") {
+          post({
+            type: "progress",
+            stage: e.status ?? "download",
+            progress: e.progress,
+          });
+        }
+      },
+    });
+    asrPipeline = created as AutomaticSpeechRecognitionPipeline;
+    activeModelId = msg.model;
+    post({ type: "ready" });
+  } catch (err) {
+    post({ type: "error", message: errorMessage(err) });
+  }
+}
+
+/**
+ * Runs speech recognition over `msg.audio` and replies with a "result" message
+ * carrying the trimmed transcript. Replies with "error" when no pipeline has
+ * been loaded yet or when inference throws.
+ */
+async function handleTranscribe(msg: TranscribeMessage) {
+  const recognizer = asrPipeline;
+  if (!recognizer) {
+    post({ type: "error", message: "Whisper worker not initialized" });
+    return;
+  }
+  try {
+    // A missing, empty, or "auto" language is handed to the pipeline as undefined.
+    const requested = msg.language;
+    const language = !requested || requested === "auto" ? undefined : requested;
+    const output = (await recognizer(msg.audio, {
+      language,
+      task: "transcribe",
+    })) as { text?: string } | Array<{ text?: string }>;
+    // Normalise the single-object and array result shapes to one code path.
+    const segments = Array.isArray(output) ? output : [output];
+    const transcript = segments.map((segment) => segment.text ?? "").join(" ");
+    post({ type: "result", text: transcript.trim() });
+  } catch (err) {
+    post({ type: "error", message: errorMessage(err) });
+  }
+}
+
+/**
+ * Releases the loaded pipeline, if any, and resets the module state.
+ *
+ * The state is cleared before dispose() is awaited, so it is reset even when
+ * disposal fails. A failure is reported as an "error" message, matching the
+ * other handlers; the caller discards this promise, so a rejection would
+ * otherwise go unhandled.
+ */
+async function handleDispose(): Promise<void> {
+  if (!asrPipeline) return;
+  const stale = asrPipeline;
+  asrPipeline = null;
+  activeModelId = null;
+  try {
+    await stale.dispose();
+  } catch (err) {
+    post({ type: "error", message: errorMessage(err) });
+  }
+}
+
+/**
+ * Converts any thrown value into a human-readable string.
+ *
+ * - `Error` instances yield their `message`.
+ * - Other objects yield a string `message` property if they have one,
+ *   otherwise their JSON form, so they do not collapse to "[object Object]".
+ * - Everything else (strings, numbers, null, undefined) goes through `String`.
+ */
+function errorMessage(err: unknown): string {
+  if (err instanceof Error) return err.message;
+  if (typeof err === "object" && err !== null) {
+    const candidate = (err as { message?: unknown }).message;
+    if (typeof candidate === "string") return candidate;
+    try {
+      // JSON.stringify can return undefined (e.g. toJSON returning undefined),
+      // hence the runtime type check before using the result.
+      const json: unknown = JSON.stringify(err);
+      if (typeof json === "string") return json;
+    } catch {
+      // Circular or otherwise unserialisable: fall through to String().
+    }
+  }
+  return String(err);
+}
+
+// Entry point: route each incoming protocol message to its handler.
+// Typing the event as MessageEvent<InMessage> makes `msg` a discriminated
+// union, so each case hands its handler the correctly narrowed message
+// instead of an untyped `any`. The handler promises are intentionally not
+// awaited (`void`).
+ctx.addEventListener("message", (event: MessageEvent<InMessage>) => {
+  const msg = event.data;
+  switch (msg.type) {
+    case "init":
+      void handleInit(msg);
+      break;
+    case "transcribe":
+      void handleTranscribe(msg);
+      break;
+    case "dispose":
+      void handleDispose();
+      break;
+  }
+});