From feb9f3553ff5110c5382aa272ae297a7673413bc Mon Sep 17 00:00:00 2001 From: Kandev Agent Date: Fri, 29 May 2026 17:22:55 +0100 Subject: [PATCH 1/4] feat: add voice mode for chat input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a voice input button to the chat composer with three transcription engines: in-browser Web Speech API, in-browser Whisper via @huggingface/transformers, and a server-side OpenAI Whisper fallback gated on KANDEV_VOICE_OPENAI_API_KEY. User-configurable from a new /settings/voice-mode page: - Master on/off toggle (defaults to on; opt-out, not opt-in) - Engine preference (auto / Web Speech / Whisper Web / Whisper Server) - Language hint (auto-detect or BCP-47 pinned) - Activation mode (click-to-toggle or hold-to-talk) - Auto-send after transcription - Whisper Web model size (tiny / base / small) - Editable Cmd+Shift+M keyboard shortcut Voice settings live under user_settings.voice_mode as a nested JSON object — no schema migration. Setting changes broadcast via the existing user.settings.updated WS event so any open chat tab updates live. The mic button is rendered on the right of the composer alongside the send button (round primary fill; destructive fill with pulse ring while recording). On browsers without a usable engine (e.g. iOS Chrome over HTTP) the button stays visible as a muted greyed icon — tapping it surfaces a toast with the reason (typically a missing secure context). Whisper Web runs in a Web Worker with fp16 decoder weights (avoids the MatMulNBits scale-missing crash that q4/q8 hit on WASM). Models are pulled from the onnx-community/* mirror and cached by transformers.js in the Cache Storage on first use. 
--- apps/backend/cmd/kandev/helpers.go | 8 + apps/backend/cmd/kandev/main.go | 1 + apps/backend/internal/common/config/config.go | 19 + .../internal/user/controller/controller.go | 1 + apps/backend/internal/user/dto/dto.go | 3 + apps/backend/internal/user/models/models.go | 26 + apps/backend/internal/user/service/service.go | 65 +++ .../internal/user/service/service_test.go | 99 ++++ apps/backend/internal/user/store/sqlite.go | 56 ++ .../voice/handlers/transcribe_handlers.go | 109 ++++ .../handlers/transcribe_handlers_test.go | 157 ++++++ .../internal/voice/transcribe/service.go | 185 +++++++ .../internal/voice/transcribe/service_test.go | 217 ++++++++ apps/pnpm-lock.yaml | 267 +++++++++- apps/web/app/settings/voice-mode/page.tsx | 21 + .../settings/editors-settings-state.tsx | 3 +- .../settings/keyboard-shortcuts-card.tsx | 2 +- .../settings/settings-app-sidebar.tsx | 114 +++-- .../settings/voice-mode-settings.tsx | 482 ++++++++++++++++++ .../components/task/chat/chat-input-body.tsx | 7 + .../task/chat/chat-input-container.tsx | 34 ++ .../task/chat/chat-input-toolbar.tsx | 22 +- .../task/chat/voice-input-button.tsx | 265 ++++++++++ apps/web/hooks/use-user-display-settings.ts | 26 +- apps/web/hooks/use-voice-input.test.ts | 199 ++++++++ apps/web/hooks/use-voice-input.ts | 471 +++++++++++++++++ apps/web/lib/api/domains/settings-api.ts | 2 + apps/web/lib/api/domains/voice-api.test.ts | 63 +++ apps/web/lib/api/domains/voice-api.ts | 49 ++ apps/web/lib/keyboard/constants.ts | 6 + .../lib/keyboard/shortcut-overrides.test.ts | 3 +- apps/web/lib/keyboard/shortcut-overrides.ts | 4 +- apps/web/lib/ssr/user-settings.test.ts | 82 ++- apps/web/lib/ssr/user-settings.ts | 27 + .../state/slices/settings/settings-slice.ts | 3 +- apps/web/lib/state/slices/settings/types.ts | 25 + apps/web/lib/types/backend.ts | 1 + apps/web/lib/types/http-voice.ts | 17 + apps/web/lib/types/http.ts | 3 + apps/web/lib/voice/capabilities.test.ts | 97 ++++ apps/web/lib/voice/capabilities.ts | 75 +++ 
apps/web/lib/voice/whisper-web-client.ts | 186 +++++++ apps/web/lib/voice/whisper-web-models.ts | 41 ++ apps/web/lib/ws/handlers/users.ts | 2 + apps/web/package.json | 1 + apps/web/workers/whisper-web.worker.ts | 138 +++++ 46 files changed, 3608 insertions(+), 76 deletions(-) create mode 100644 apps/backend/internal/voice/handlers/transcribe_handlers.go create mode 100644 apps/backend/internal/voice/handlers/transcribe_handlers_test.go create mode 100644 apps/backend/internal/voice/transcribe/service.go create mode 100644 apps/backend/internal/voice/transcribe/service_test.go create mode 100644 apps/web/app/settings/voice-mode/page.tsx create mode 100644 apps/web/components/settings/voice-mode-settings.tsx create mode 100644 apps/web/components/task/chat/voice-input-button.tsx create mode 100644 apps/web/hooks/use-voice-input.test.ts create mode 100644 apps/web/hooks/use-voice-input.ts create mode 100644 apps/web/lib/api/domains/voice-api.test.ts create mode 100644 apps/web/lib/api/domains/voice-api.ts create mode 100644 apps/web/lib/types/http-voice.ts create mode 100644 apps/web/lib/voice/capabilities.test.ts create mode 100644 apps/web/lib/voice/capabilities.ts create mode 100644 apps/web/lib/voice/whisper-web-client.ts create mode 100644 apps/web/lib/voice/whisper-web-models.ts create mode 100644 apps/web/workers/whisper-web.worker.ts diff --git a/apps/backend/cmd/kandev/helpers.go b/apps/backend/cmd/kandev/helpers.go index 58eeec1f5..dccbb750d 100644 --- a/apps/backend/cmd/kandev/helpers.go +++ b/apps/backend/cmd/kandev/helpers.go @@ -71,6 +71,8 @@ import ( userhandlers "github.com/kandev/kandev/internal/user/handlers" utilitycontroller "github.com/kandev/kandev/internal/utility/controller" utilityhandlers "github.com/kandev/kandev/internal/utility/handlers" + voicehandlers "github.com/kandev/kandev/internal/voice/handlers" + "github.com/kandev/kandev/internal/voice/transcribe" workflowcontroller "github.com/kandev/kandev/internal/workflow/controller" 
workflowhandlers "github.com/kandev/kandev/internal/workflow/handlers" "github.com/kandev/kandev/internal/worktree" @@ -449,6 +451,7 @@ type routeParams struct { devMode bool httpPort int features config.FeaturesConfig + voice config.VoiceConfig log *logger.Logger } @@ -698,6 +701,11 @@ func registerSecondaryRoutes( utilityhandlers.RegisterRoutes(p.router, p.utilityCtrl, p.lifecycleMgr, p.hostUtilityMgr, p.services.User, p.log) p.log.Debug("Registered Utility Agents handlers (HTTP)") + // Voice transcription fallback. The route always mounts, but returns 503 + // when no API key is configured so the frontend can hide the path. + voicehandlers.RegisterRoutes(p.router, transcribe.New(p.voice.OpenAIAPIKey), p.log) + p.log.Debug("Registered Voice handlers (HTTP)") + agentcapabilities.RegisterRoutes(p.router, p.hostUtilityMgr, p.log) p.log.Debug("Registered Agent Capabilities handlers (HTTP)") diff --git a/apps/backend/cmd/kandev/main.go b/apps/backend/cmd/kandev/main.go index fa3070382..4f771448c 100644 --- a/apps/backend/cmd/kandev/main.go +++ b/apps/backend/cmd/kandev/main.go @@ -1513,6 +1513,7 @@ func buildHTTPServer( devMode: cfg.Debug.DevMode || cfg.Debug.PprofEnabled, httpPort: port, features: cfg.Features, + voice: cfg.Voice, log: log, }) diff --git a/apps/backend/internal/common/config/config.go b/apps/backend/internal/common/config/config.go index 2d76bcca4..9028d3dce 100644 --- a/apps/backend/internal/common/config/config.go +++ b/apps/backend/internal/common/config/config.go @@ -41,6 +41,7 @@ type Config struct { RepoClone RepoCloneConfig `mapstructure:"repoClone"` Debug DebugConfig `mapstructure:"debug"` Office OfficeConfig `mapstructure:"office"` + Voice VoiceConfig `mapstructure:"voice"` Features FeaturesConfig `mapstructure:"features"` } @@ -147,6 +148,20 @@ type OfficeConfig struct { JWTSigningKey string `mapstructure:"jwtSigningKey"` } +// VoiceConfig holds configuration for the chat voice-input transcription +// fallback. 
The primary voice-input engine runs entirely in the browser +// (Web Speech API); this server-side fallback is only used when the browser +// has no SpeechRecognition support (e.g. Firefox). +// +// When OpenAIAPIKey is empty the /api/v1/transcribe endpoint returns 503 +// and the frontend hides the fallback path, so the feature is safe to +// ship un-configured. +type VoiceConfig struct { + // OpenAIAPIKey is the API key used to call OpenAI's Whisper transcription + // endpoint. Set via KANDEV_VOICE_OPENAI_API_KEY. + OpenAIAPIKey string `mapstructure:"openAIApiKey"` +} + // FeaturesConfig is the central registry of runtime feature flags. Every flag // defaults to false so production binaries ship with new work hidden until a // deployment explicitly opts in (env var, e.g. KANDEV_FEATURES_OFFICE=true). @@ -312,6 +327,9 @@ func setDefaults(v *viper.Viper) { // Office defaults v.SetDefault("office.jwtSigningKey", "") + // Voice defaults + v.SetDefault("voice.openAIApiKey", "") + // Feature-flag defaults live in ./features.yaml (symlinked to // apps/backend/internal/features/features.yaml). 
LoadWithPath applies // them via features.ApplyDefaults after this function returns so the @@ -428,6 +446,7 @@ func LoadWithPath(configPath string) (*Config, error) { _ = v.BindEnv("events.namespace", "KANDEV_EVENTS_NAMESPACE") _ = v.BindEnv("debug.devMode", "KANDEV_DEBUG_DEV_MODE") _ = v.BindEnv("debug.pprofEnabled", "KANDEV_DEBUG_PPROF_ENABLED") + _ = v.BindEnv("voice.openAIApiKey", "KANDEV_VOICE_OPENAI_API_KEY") // Configure config file v.SetConfigName("config") diff --git a/apps/backend/internal/user/controller/controller.go b/apps/backend/internal/user/controller/controller.go index 2b6cb8c8f..1ca49c98e 100644 --- a/apps/backend/internal/user/controller/controller.go +++ b/apps/backend/internal/user/controller/controller.go @@ -68,6 +68,7 @@ func (c *Controller) UpdateUserSettings(ctx context.Context, req dto.UpdateUserS TerminalFontFamily: req.TerminalFontFamily, TerminalFontSize: req.TerminalFontSize, ChangesPanelLayout: req.ChangesPanelLayout, + VoiceMode: req.VoiceMode, }) if err != nil { return dto.UserSettingsResponse{}, err diff --git a/apps/backend/internal/user/dto/dto.go b/apps/backend/internal/user/dto/dto.go index 450f11b2d..3329aeb1a 100644 --- a/apps/backend/internal/user/dto/dto.go +++ b/apps/backend/internal/user/dto/dto.go @@ -39,6 +39,7 @@ type UserSettingsDTO struct { TerminalFontFamily string `json:"terminal_font_family"` TerminalFontSize int `json:"terminal_font_size"` ChangesPanelLayout string `json:"changes_panel_layout"` + VoiceMode models.VoiceModeSettings `json:"voice_mode"` UpdatedAt string `json:"updated_at"` } @@ -82,6 +83,7 @@ type UpdateUserSettingsRequest struct { TerminalFontFamily *string `json:"terminal_font_family,omitempty"` TerminalFontSize *int `json:"terminal_font_size,omitempty"` ChangesPanelLayout *string `json:"changes_panel_layout,omitempty"` + VoiceMode *models.VoiceModeSettings `json:"voice_mode,omitempty"` } func FromUser(user *models.User) UserDTO { @@ -120,6 +122,7 @@ func FromUserSettings(settings 
*models.UserSettings) UserSettingsDTO { TerminalFontFamily: settings.TerminalFontFamily, TerminalFontSize: settings.TerminalFontSize, ChangesPanelLayout: settings.ChangesPanelLayout, + VoiceMode: settings.VoiceMode, UpdatedAt: settings.UpdatedAt.Format(time.RFC3339), } } diff --git a/apps/backend/internal/user/models/models.go b/apps/backend/internal/user/models/models.go index 4b48a5ff4..80475e904 100644 --- a/apps/backend/internal/user/models/models.go +++ b/apps/backend/internal/user/models/models.go @@ -38,10 +38,36 @@ type UserSettings struct { TerminalFontFamily string `json:"terminal_font_family"` TerminalFontSize int `json:"terminal_font_size"` ChangesPanelLayout string `json:"changes_panel_layout"` // "flat" | "tree" + VoiceMode VoiceModeSettings `json:"voice_mode"` CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` } +// VoiceModeSettings is the per-user configuration surface for the chat +// voice-input feature. Stored as a nested JSON object inside the `users.settings` +// blob — adding fields here does not require a schema migration. +type VoiceModeSettings struct { + // Enabled gates the whole feature. When false, the mic button is hidden + // entirely and no voice-related hooks run on the chat input. Defaults to + // true for new users; pre-existing user rows that have no `enabled` field + // in their stored JSON are also treated as enabled (see store layer). + Enabled bool `json:"enabled"` + // Engine is the user's preferred transcription engine. + // "auto" | "webSpeech" | "whisperWeb" | "whisperServer". Default "auto". + Engine string `json:"engine"` + // Language is the BCP-47 tag or "auto" to use the browser's language. + // Examples: "en-US", "pt-PT", "ja-JP". Default "auto". + Language string `json:"language"` + // Mode controls how the mic button is activated: "toggle" (click to start/stop) + // or "hold" (push-to-talk). Default "toggle". 
+ Mode string `json:"mode"` + // AutoSend submits the chat message immediately after the transcript is inserted. + AutoSend bool `json:"auto_send"` + // WhisperWebModel selects the in-browser Whisper model when engine = whisperWeb. + // "tiny" | "base" | "small". Default "base". + WhisperWebModel string `json:"whisper_web_model"` +} + // SavedLayout represents a user-saved dockview layout configuration. type SavedLayout struct { ID string `json:"id"` diff --git a/apps/backend/internal/user/service/service.go b/apps/backend/internal/user/service/service.go index f83991a86..1a2709335 100644 --- a/apps/backend/internal/user/service/service.go +++ b/apps/backend/internal/user/service/service.go @@ -58,6 +58,7 @@ type UpdateUserSettingsRequest struct { TerminalFontFamily *string TerminalFontSize *int ChangesPanelLayout *string + VoiceMode *models.VoiceModeSettings } func NewService(repo store.Repository, eventBus bus.EventBus, log *logger.Logger) *Service { @@ -122,6 +123,9 @@ func (s *Service) UpdateUserSettings(ctx context.Context, req *UpdateUserSetting if err := applySidebarViews(settings, req); err != nil { return nil, fmt.Errorf("%w: %s", ErrValidation, err.Error()) } + if err := applyVoiceMode(settings, req.VoiceMode); err != nil { + return nil, fmt.Errorf("%w: %s", ErrValidation, err.Error()) + } settings.UpdatedAt = time.Now().UTC() if err := s.repo.UpsertUserSettings(ctx, settings); err != nil { return nil, err @@ -220,6 +224,66 @@ func applyChangesPanelLayout(settings *models.UserSettings, value *string) error return nil } +var ( + validVoiceEngines = map[string]struct{}{ + "auto": {}, + "webSpeech": {}, + "whisperWeb": {}, + "whisperServer": {}, + } + validVoiceModes = map[string]struct{}{ + "toggle": {}, + "hold": {}, + } + validWhisperWebModels = map[string]struct{}{ + "tiny": {}, + "base": {}, + "small": {}, + } +) + +// applyVoiceMode validates the inbound voice-mode settings and merges them +// onto the user record. 
Each string sub-field is applied independently and an +// empty string means "no change", so a partial update (e.g. just `engine`) works. +// +// `enabled` and `auto_send` are plain bools and are always overwritten from the +// payload, so a request that omits them stores false. The settings UI always +// sends the full VoiceMode object, so that is not a concern in practice. +func applyVoiceMode(settings *models.UserSettings, value *models.VoiceModeSettings) error { + if value == nil { + return nil + } + current := settings.VoiceMode + if current.Engine == "" { + current.Engine = "auto" + } + if value.Engine != "" { + if _, ok := validVoiceEngines[value.Engine]; !ok { + return errors.New("voice_mode.engine must be 'auto', 'webSpeech', 'whisperWeb', or 'whisperServer'") + } + current.Engine = value.Engine + } + if value.Language != "" { + current.Language = strings.TrimSpace(value.Language) + } + if value.Mode != "" { + if _, ok := validVoiceModes[value.Mode]; !ok { + return errors.New("voice_mode.mode must be 'toggle' or 'hold'") + } + current.Mode = value.Mode + } + if value.WhisperWebModel != "" { + if _, ok := validWhisperWebModels[value.WhisperWebModel]; !ok { + return errors.New("voice_mode.whisper_web_model must be 'tiny', 'base', or 'small'") + } + current.WhisperWebModel = value.WhisperWebModel + } + current.AutoSend = value.AutoSend + current.Enabled = value.Enabled + settings.VoiceMode = current + return nil +} + // applyChatSubmitKey validates and applies the chat_submit_key setting. 
func (s *Service) applyChatSubmitKey(settings *models.UserSettings, req *UpdateUserSettingsRequest) error { if req.ChatSubmitKey == nil { @@ -332,6 +396,7 @@ func (s *Service) publishUserSettingsEvent(ctx context.Context, settings *models "terminal_font_family": settings.TerminalFontFamily, "terminal_font_size": settings.TerminalFontSize, "changes_panel_layout": settings.ChangesPanelLayout, + "voice_mode": settings.VoiceMode, "updated_at": settings.UpdatedAt.Format(time.RFC3339), } if err := s.eventBus.Publish(ctx, events.UserSettingsUpdated, bus.NewEvent(events.UserSettingsUpdated, "user-service", data)); err != nil { diff --git a/apps/backend/internal/user/service/service_test.go b/apps/backend/internal/user/service/service_test.go index 1e3a1efdb..ae01c50d6 100644 --- a/apps/backend/internal/user/service/service_test.go +++ b/apps/backend/internal/user/service/service_test.go @@ -441,3 +441,102 @@ func TestApplySidebarViews(t *testing.T) { }) } } + +func TestApplyVoiceMode(t *testing.T) { + t.Run("nil value leaves settings unchanged", func(t *testing.T) { + settings := &models.UserSettings{ + VoiceMode: models.VoiceModeSettings{Engine: "webSpeech", Language: "en-US"}, + } + if err := applyVoiceMode(settings, nil); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if settings.VoiceMode.Engine != "webSpeech" || settings.VoiceMode.Language != "en-US" { + t.Fatalf("expected unchanged, got %+v", settings.VoiceMode) + } + }) + + t.Run("happy path: applies a full update", func(t *testing.T) { + settings := &models.UserSettings{} + err := applyVoiceMode(settings, &models.VoiceModeSettings{ + Enabled: true, + Engine: "whisperWeb", + Language: "pt-PT", + Mode: "hold", + AutoSend: true, + WhisperWebModel: "small", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + want := models.VoiceModeSettings{ + Enabled: true, + Engine: "whisperWeb", + Language: "pt-PT", + Mode: "hold", + AutoSend: true, + WhisperWebModel: "small", + } + if settings.VoiceMode 
!= want { + t.Fatalf("expected %+v, got %+v", want, settings.VoiceMode) + } + }) + + t.Run("enabled=false is honored (user disabled the feature)", func(t *testing.T) { + settings := &models.UserSettings{VoiceMode: models.VoiceModeSettings{Enabled: true}} + if err := applyVoiceMode(settings, &models.VoiceModeSettings{Enabled: false}); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if settings.VoiceMode.Enabled { + t.Fatalf("expected Enabled=false after disable, got true") + } + }) + + t.Run("invalid engine is rejected", func(t *testing.T) { + err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{Engine: "bogus"}) + if err == nil || !strings.Contains(err.Error(), "voice_mode.engine") { + t.Fatalf("expected engine validation error, got %v", err) + } + }) + + t.Run("invalid mode is rejected", func(t *testing.T) { + err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{Mode: "tap"}) + if err == nil || !strings.Contains(err.Error(), "voice_mode.mode") { + t.Fatalf("expected mode validation error, got %v", err) + } + }) + + t.Run("invalid whisper_web_model is rejected", func(t *testing.T) { + err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{WhisperWebModel: "huge"}) + if err == nil || !strings.Contains(err.Error(), "voice_mode.whisper_web_model") { + t.Fatalf("expected model validation error, got %v", err) + } + }) + + t.Run("partial update preserves untouched fields", func(t *testing.T) { + settings := &models.UserSettings{ + VoiceMode: models.VoiceModeSettings{ + Engine: "whisperServer", + Language: "en-GB", + Mode: "toggle", + WhisperWebModel: "tiny", + }, + } + // Empty strings on the new payload mean "no change" for those fields. 
+ err := applyVoiceMode(settings, &models.VoiceModeSettings{Engine: "webSpeech"}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if settings.VoiceMode.Engine != "webSpeech" { + t.Fatalf("expected engine=webSpeech, got %q", settings.VoiceMode.Engine) + } + if settings.VoiceMode.Language != "en-GB" { + t.Fatalf("expected language preserved, got %q", settings.VoiceMode.Language) + } + if settings.VoiceMode.Mode != "toggle" { + t.Fatalf("expected mode preserved, got %q", settings.VoiceMode.Mode) + } + if settings.VoiceMode.WhisperWebModel != "tiny" { + t.Fatalf("expected whisper model preserved, got %q", settings.VoiceMode.WhisperWebModel) + } + }) +} diff --git a/apps/backend/internal/user/store/sqlite.go b/apps/backend/internal/user/store/sqlite.go index 954c9f93b..6671caf8c 100644 --- a/apps/backend/internal/user/store/sqlite.go +++ b/apps/backend/internal/user/store/sqlite.go @@ -162,6 +162,7 @@ func (r *sqliteRepository) UpsertUserSettings(ctx context.Context, settings *mod "terminal_font_family": settings.TerminalFontFamily, "terminal_font_size": settings.TerminalFontSize, "changes_panel_layout": settings.ChangesPanelLayout, + "voice_mode": settings.VoiceMode, }) if err != nil { return err @@ -192,6 +193,58 @@ func scanUser(scanner interface{ Scan(dest ...any) error }) (*models.User, error return user, nil } +// defaultVoiceModeSettings returns the baseline VoiceMode configuration for +// users with no saved preferences. Mirrored on the frontend; keep in sync. +func defaultVoiceModeSettings() models.VoiceModeSettings { + return models.VoiceModeSettings{ + Enabled: true, + Engine: "auto", + Language: "auto", + Mode: "toggle", + AutoSend: false, + WhisperWebModel: "base", + } +} + +// storedVoiceMode is the on-disk JSON shape — uses *bool for `enabled` so we +// can distinguish "absent" (older rows written before the toggle existed — +// must default to true) from "explicitly false" (user disabled the feature). 
+type storedVoiceMode struct { + Enabled *bool `json:"enabled"` + Engine string `json:"engine"` + Language string `json:"language"` + Mode string `json:"mode"` + AutoSend bool `json:"auto_send"` + WhisperWebModel string `json:"whisper_web_model"` +} + +// mergeVoiceModeDefaults fills in zero/missing fields on a stored VoiceMode +// payload so older user rows (written before VoiceMode existed) still produce +// usable settings instead of empty strings the frontend would reject. +func mergeVoiceModeDefaults(stored *storedVoiceMode) models.VoiceModeSettings { + out := defaultVoiceModeSettings() + if stored == nil { + return out + } + if stored.Enabled != nil { + out.Enabled = *stored.Enabled + } + if stored.Engine != "" { + out.Engine = stored.Engine + } + if stored.Language != "" { + out.Language = stored.Language + } + if stored.Mode != "" { + out.Mode = stored.Mode + } + if stored.WhisperWebModel != "" { + out.WhisperWebModel = stored.WhisperWebModel + } + out.AutoSend = stored.AutoSend + return out +} + func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID string) (*models.UserSettings, error) { settings := &models.UserSettings{} var settingsRaw string @@ -208,6 +261,7 @@ func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID strin settings.TerminalLinkBehavior = "new_tab" settings.ChangesPanelLayout = "flat" settings.SidebarViews = []models.SidebarView{} + settings.VoiceMode = defaultVoiceModeSettings() return settings, nil } var payload struct { @@ -235,6 +289,7 @@ func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID strin TerminalFontFamily string `json:"terminal_font_family"` TerminalFontSize int `json:"terminal_font_size"` ChangesPanelLayout string `json:"changes_panel_layout"` + VoiceMode *storedVoiceMode `json:"voice_mode"` } if err := json.Unmarshal([]byte(settingsRaw), &payload); err != nil { return nil, err @@ -294,6 +349,7 @@ func scanUserSettings(scanner interface{ Scan(dest ...any) error 
}, userID strin } settings.TerminalFontFamily = payload.TerminalFontFamily settings.TerminalFontSize = payload.TerminalFontSize + settings.VoiceMode = mergeVoiceModeDefaults(payload.VoiceMode) if payload.ChangesPanelLayout == "tree" { settings.ChangesPanelLayout = "tree" } else { diff --git a/apps/backend/internal/voice/handlers/transcribe_handlers.go b/apps/backend/internal/voice/handlers/transcribe_handlers.go new file mode 100644 index 000000000..517ad2e26 --- /dev/null +++ b/apps/backend/internal/voice/handlers/transcribe_handlers.go @@ -0,0 +1,109 @@ +// Package handlers exposes the HTTP surface for the voice-input transcription +// fallback. The endpoint is unauthenticated (matches /api/v1/features) — the +// Web Speech API path is preferred by the frontend, so this server-side +// fallback only runs when the browser cannot do it locally. +package handlers + +import ( + "errors" + "io" + "net/http" + + "github.com/gin-gonic/gin" + "go.uber.org/zap" + + "github.com/kandev/kandev/internal/common/logger" + "github.com/kandev/kandev/internal/voice/transcribe" +) + +// maxAudioBytes caps the multipart audio payload. Whisper accepts up to 25 MB +// per request; we cap lower so a stuck mic doesn't blow up backend memory or +// burn API spend on a stuck recording. +const maxAudioBytes = 10 * 1024 * 1024 + +// Handlers wires the transcribe service into Gin routes. +type Handlers struct { + svc *transcribe.Service + log *logger.Logger +} + +// NewHandlers constructs a Handlers from a transcribe Service. +func NewHandlers(svc *transcribe.Service, log *logger.Logger) *Handlers { + return &Handlers{ + svc: svc, + log: log.WithFields(zap.String("component", "voice-handlers")), + } +} + +// RegisterRoutes mounts the voice transcription endpoint. 
+func RegisterRoutes(router *gin.Engine, svc *transcribe.Service, log *logger.Logger) { + h := NewHandlers(svc, log) + api := router.Group("/api/v1") + api.POST("/transcribe", h.httpTranscribe) +} + +func (h *Handlers) httpTranscribe(c *gin.Context) { + if h.svc == nil || !h.svc.Configured() { + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "voice transcription is not configured on this server", + }) + return + } + + // MaxBytesReader caps the request body at maxAudioBytes. c.FormFile parses the + // whole multipart form, so an oversized upload fails there (400), not at io.ReadAll. + c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, maxAudioBytes) + + fh, err := c.FormFile("audio") + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "audio file is required (multipart field 'audio')"}) + return + } + + file, err := fh.Open() + if err != nil { + h.log.Warn("open uploaded audio failed", zap.Error(err)) + c.JSON(http.StatusBadRequest, gin.H{"error": "cannot open uploaded audio"}) + return + } + defer func() { _ = file.Close() }() + + data, err := io.ReadAll(file) + if err != nil { + h.log.Warn("read uploaded audio failed", zap.Error(err)) + c.JSON(http.StatusRequestEntityTooLarge, gin.H{"error": "audio payload too large or unreadable"}) + return + } + if len(data) == 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "audio file is empty"}) + return + } + + mime := fh.Header.Get("Content-Type") + text, err := h.svc.Transcribe(c.Request.Context(), data, mime, fh.Filename) + if err != nil { + h.respondError(c, err) + return + } + c.JSON(http.StatusOK, gin.H{"text": text}) +} + +func (h *Handlers) respondError(c *gin.Context, err error) { + if errors.Is(err, transcribe.ErrNotConfigured) { + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "voice transcription is not configured on this server", + }) + return + } + var upstream *transcribe.UpstreamError + if errors.As(err, &upstream) { + h.log.Warn("whisper upstream error", + zap.Int("status", 
upstream.StatusCode), + zap.String("body", upstream.Body), + ) + c.JSON(http.StatusBadGateway, gin.H{"error": "upstream transcription error"}) + return + } + h.log.Error("transcription failed", zap.Error(err)) + c.JSON(http.StatusInternalServerError, gin.H{"error": "transcription failed"}) +} diff --git a/apps/backend/internal/voice/handlers/transcribe_handlers_test.go b/apps/backend/internal/voice/handlers/transcribe_handlers_test.go new file mode 100644 index 000000000..aa3170578 --- /dev/null +++ b/apps/backend/internal/voice/handlers/transcribe_handlers_test.go @@ -0,0 +1,157 @@ +package handlers + +import ( + "bytes" + "encoding/json" + "mime/multipart" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/gin-gonic/gin" + + "github.com/kandev/kandev/internal/common/logger" + "github.com/kandev/kandev/internal/voice/transcribe" +) + +func init() { + gin.SetMode(gin.TestMode) +} + +func testLogger(t *testing.T) *logger.Logger { + t.Helper() + log, err := logger.NewLogger(logger.LoggingConfig{Level: "error", Format: "text", OutputPath: "stderr"}) + if err != nil { + t.Fatalf("logger.NewLogger: %v", err) + } + return log +} + +func buildAudioRequest(t *testing.T, field, filename, mime string, data []byte) (*http.Request, string) { + t.Helper() + buf := &bytes.Buffer{} + w := multipart.NewWriter(buf) + if data != nil { + fw, err := createFormFile(w, field, filename, mime) + if err != nil { + t.Fatal(err) + } + _, _ = fw.Write(data) + } + _ = w.Close() + req := httptest.NewRequest(http.MethodPost, "/api/v1/transcribe", buf) + req.Header.Set("Content-Type", w.FormDataContentType()) + return req, w.FormDataContentType() +} + +func createFormFile(w *multipart.Writer, field, filename, mime string) (interface{ Write([]byte) (int, error) }, error) { + if mime == "" { + return w.CreateFormFile(field, filename) + } + hdr := make(map[string][]string) + hdr["Content-Disposition"] = []string{"form-data; name=\"" + field + "\"; filename=\"" + filename + 
"\""} + hdr["Content-Type"] = []string{mime} + return w.CreatePart(hdr) +} + +func newRouter(svc *transcribe.Service, t *testing.T) *gin.Engine { + r := gin.New() + RegisterRoutes(r, svc, testLogger(t)) + return r +} + +func TestTranscribe_NotConfigured(t *testing.T) { + svc := transcribe.New("") + r := newRouter(svc, t) + + req, _ := buildAudioRequest(t, "audio", "a.webm", "audio/webm", []byte("hello")) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Fatalf("status = %d, want 503; body=%s", w.Code, w.Body.String()) + } +} + +func TestTranscribe_MissingFile(t *testing.T) { + svc := transcribe.New("sk-test") + r := newRouter(svc, t) + + // No file part — just an empty form. + buf := &bytes.Buffer{} + w := multipart.NewWriter(buf) + _ = w.Close() + req := httptest.NewRequest(http.MethodPost, "/api/v1/transcribe", buf) + req.Header.Set("Content-Type", w.FormDataContentType()) + + rr := httptest.NewRecorder() + r.ServeHTTP(rr, req) + if rr.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want 400; body=%s", rr.Code, rr.Body.String()) + } +} + +func TestTranscribe_EmptyAudio(t *testing.T) { + svc := transcribe.New("sk-test") + r := newRouter(svc, t) + + req, _ := buildAudioRequest(t, "audio", "a.webm", "audio/webm", []byte{}) + rr := httptest.NewRecorder() + r.ServeHTTP(rr, req) + if rr.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want 400; body=%s", rr.Code, rr.Body.String()) + } +} + +func TestTranscribe_Success(t *testing.T) { + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Header.Get("Authorization") != "Bearer sk-test" { + t.Errorf("auth header missing") + } + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"text":"transcribed"}`)) + })) + defer upstream.Close() + + svc := transcribe.New("sk-test", transcribe.WithEndpoint(upstream.URL)) + r := newRouter(svc, t) + + req, _ := buildAudioRequest(t, "audio", 
"clip.webm", "audio/webm", []byte("bytes")) + rr := httptest.NewRecorder() + r.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, want 200; body=%s", rr.Code, rr.Body.String()) + } + var body struct { + Text string `json:"text"` + } + if err := json.Unmarshal(rr.Body.Bytes(), &body); err != nil { + t.Fatal(err) + } + if body.Text != "transcribed" { + t.Errorf("text = %q", body.Text) + } +} + +func TestTranscribe_UpstreamError(t *testing.T) { + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{"error":"oops"}`)) + })) + defer upstream.Close() + + svc := transcribe.New("sk-test", transcribe.WithEndpoint(upstream.URL)) + r := newRouter(svc, t) + req, _ := buildAudioRequest(t, "audio", "a.webm", "audio/webm", []byte("bytes")) + rr := httptest.NewRecorder() + r.ServeHTTP(rr, req) + + if rr.Code != http.StatusBadGateway { + t.Fatalf("status = %d, want 502; body=%s", rr.Code, rr.Body.String()) + } + if !strings.Contains(rr.Body.String(), "upstream") { + t.Errorf("body should mention upstream: %s", rr.Body.String()) + } +} diff --git a/apps/backend/internal/voice/transcribe/service.go b/apps/backend/internal/voice/transcribe/service.go new file mode 100644 index 000000000..fad7b13b9 --- /dev/null +++ b/apps/backend/internal/voice/transcribe/service.go @@ -0,0 +1,185 @@ +// Package transcribe wraps the OpenAI Whisper transcription endpoint for the +// chat voice-input fallback. The browser's Web Speech API is the primary +// voice-input engine; this server-side path is only hit when the browser +// has no SpeechRecognition support. 
+package transcribe + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "mime/multipart" + "net/http" + "net/textproto" + "strings" + "time" +) + +// ErrNotConfigured is returned when the service has no API key — the handler +// maps this to HTTP 503 so the frontend can hide the Whisper fallback path +// instead of repeatedly retrying a deployment that will never succeed. +var ErrNotConfigured = errors.New("voice transcription is not configured") + +// UpstreamError wraps a non-2xx response from OpenAI so the handler can map +// it to HTTP 502 and surface a clean error to the caller. +type UpstreamError struct { + StatusCode int + Body string +} + +func (e *UpstreamError) Error() string { + return fmt.Sprintf("openai whisper upstream error: status=%d body=%s", e.StatusCode, e.Body) +} + +const ( + defaultEndpoint = "https://api.openai.com/v1/audio/transcriptions" + defaultModel = "whisper-1" + defaultTimeout = 60 * time.Second +) + +// Service transcribes audio via OpenAI's Whisper endpoint. +type Service struct { + apiKey string + endpoint string + model string + client *http.Client +} + +// Option customises a Service for tests (custom endpoint, HTTP client). +type Option func(*Service) + +// WithEndpoint overrides the upstream URL — used by tests with httptest servers. +func WithEndpoint(url string) Option { + return func(s *Service) { s.endpoint = url } +} + +// WithHTTPClient overrides the HTTP client. +func WithHTTPClient(c *http.Client) Option { + return func(s *Service) { s.client = c } +} + +// WithModel overrides the Whisper model name. +func WithModel(model string) Option { + return func(s *Service) { s.model = model } +} + +// New constructs a Service. apiKey may be empty; in that case Transcribe +// returns ErrNotConfigured without making any network calls. 
+func New(apiKey string, opts ...Option) *Service { + s := &Service{ + apiKey: apiKey, + endpoint: defaultEndpoint, + model: defaultModel, + client: &http.Client{Timeout: defaultTimeout}, + } + for _, o := range opts { + o(s) + } + return s +} + +// Configured reports whether the service has an API key. Used by handlers +// to short-circuit before reading the request body. +func (s *Service) Configured() bool { + return s != nil && strings.TrimSpace(s.apiKey) != "" +} + +// Transcribe sends the given audio bytes to OpenAI Whisper and returns the +// transcribed text. filename is used for the multipart Content-Disposition; +// Whisper relies on the file extension to detect the audio format. +func (s *Service) Transcribe(ctx context.Context, audio []byte, mimeType, filename string) (string, error) { + if !s.Configured() { + return "", ErrNotConfigured + } + if len(audio) == 0 { + return "", errors.New("audio payload is empty") + } + + body, contentType, err := buildMultipart(audio, mimeType, filename, s.model) + if err != nil { + return "", fmt.Errorf("build multipart body: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, s.endpoint, body) + if err != nil { + return "", fmt.Errorf("build whisper request: %w", err) + } + req.Header.Set("Authorization", "Bearer "+s.apiKey) + req.Header.Set("Content-Type", contentType) + req.Header.Set("Accept", "application/json") + + resp, err := s.client.Do(req) + if err != nil { + return "", fmt.Errorf("call whisper endpoint: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + rawBody, _ := io.ReadAll(resp.Body) + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", &UpstreamError{StatusCode: resp.StatusCode, Body: string(rawBody)} + } + + var parsed struct { + Text string `json:"text"` + } + if err := json.Unmarshal(rawBody, &parsed); err != nil { + return "", fmt.Errorf("decode whisper response: %w", err) + } + return strings.TrimSpace(parsed.Text), nil +} + +// buildMultipart 
assembles the multipart/form-data body Whisper expects: +// `file`, `model`, and `response_format=json`. +func buildMultipart(audio []byte, mimeType, filename, model string) (io.Reader, string, error) { + buf := &bytes.Buffer{} + w := multipart.NewWriter(buf) + + if filename == "" { + filename = "recording" + extensionForMime(mimeType) + } + header := textproto.MIMEHeader{} + header.Set("Content-Disposition", fmt.Sprintf(`form-data; name="file"; filename=%q`, filename)) + if mimeType != "" { + header.Set("Content-Type", mimeType) + } + filePart, err := w.CreatePart(header) + if err != nil { + return nil, "", err + } + if _, err := filePart.Write(audio); err != nil { + return nil, "", err + } + + if err := w.WriteField("model", model); err != nil { + return nil, "", err + } + if err := w.WriteField("response_format", "json"); err != nil { + return nil, "", err + } + if err := w.Close(); err != nil { + return nil, "", err + } + return buf, w.FormDataContentType(), nil +} + +// extensionForMime maps the audio MIME types MediaRecorder commonly emits to +// the file extensions Whisper recognises. Default to ".webm" — supported by +// Whisper and the most common MediaRecorder default on Chrome. 
+func extensionForMime(mime string) string { + mime = strings.ToLower(mime) + switch { + case strings.Contains(mime, "wav"): + return ".wav" + case strings.Contains(mime, "mp4"), strings.Contains(mime, "m4a"): + return ".m4a" + case strings.Contains(mime, "mpeg"), strings.Contains(mime, "mp3"): + return ".mp3" + case strings.Contains(mime, "ogg"): + return ".ogg" + default: + return ".webm" + } +} diff --git a/apps/backend/internal/voice/transcribe/service_test.go b/apps/backend/internal/voice/transcribe/service_test.go new file mode 100644 index 000000000..c7eb0f1cb --- /dev/null +++ b/apps/backend/internal/voice/transcribe/service_test.go @@ -0,0 +1,217 @@ +package transcribe + +import ( + "context" + "errors" + "io" + "mime/multipart" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestService_Transcribe_NotConfigured(t *testing.T) { + svc := New("") + _, err := svc.Transcribe(context.Background(), []byte("data"), "audio/webm", "") + if !errors.Is(err, ErrNotConfigured) { + t.Fatalf("expected ErrNotConfigured, got %v", err) + } +} + +func TestService_Configured(t *testing.T) { + if New("").Configured() { + t.Errorf("empty key should not be configured") + } + if New(" ").Configured() { + t.Errorf("whitespace-only key should not be configured") + } + if !New("sk-test").Configured() { + t.Errorf("non-empty key should be configured") + } +} + +func TestService_Transcribe_EmptyAudio(t *testing.T) { + svc := New("sk-test") + _, err := svc.Transcribe(context.Background(), nil, "audio/webm", "") + if err == nil { + t.Fatal("expected error for empty audio") + } +} + +func TestService_Transcribe_Success(t *testing.T) { + var capturedAuth string + var capturedFilename string + var capturedFileBytes []byte + var capturedModel string + var capturedFormat string + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedAuth = r.Header.Get("Authorization") + if err := r.ParseMultipartForm(32 << 20); err != nil 
{ + t.Errorf("parse multipart: %v", err) + } + capturedModel = r.FormValue("model") + capturedFormat = r.FormValue("response_format") + fh := r.MultipartForm.File["file"] + if len(fh) != 1 { + t.Fatalf("expected 1 file part, got %d", len(fh)) + } + capturedFilename = fh[0].Filename + f, err := fh[0].Open() + if err != nil { + t.Fatalf("open file: %v", err) + } + defer func() { _ = f.Close() }() + capturedFileBytes, _ = io.ReadAll(f) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"text":"hello world"}`)) + })) + defer srv.Close() + + svc := New("sk-test", WithEndpoint(srv.URL)) + text, err := svc.Transcribe(context.Background(), []byte("audio-bytes"), "audio/webm", "clip.webm") + if err != nil { + t.Fatalf("Transcribe failed: %v", err) + } + if text != "hello world" { + t.Errorf("unexpected text: %q", text) + } + if capturedAuth != "Bearer sk-test" { + t.Errorf("auth header = %q", capturedAuth) + } + if capturedModel != defaultModel { + t.Errorf("model = %q", capturedModel) + } + if capturedFormat != "json" { + t.Errorf("response_format = %q", capturedFormat) + } + if capturedFilename != "clip.webm" { + t.Errorf("filename = %q", capturedFilename) + } + if string(capturedFileBytes) != "audio-bytes" { + t.Errorf("file body = %q", string(capturedFileBytes)) + } +} + +func TestService_Transcribe_DerivedFilename(t *testing.T) { + var capturedFilename string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = r.ParseMultipartForm(32 << 20) + fh := r.MultipartForm.File["file"] + if len(fh) == 1 { + capturedFilename = fh[0].Filename + } + _, _ = w.Write([]byte(`{"text":""}`)) + })) + defer srv.Close() + + svc := New("sk-test", WithEndpoint(srv.URL)) + _, err := svc.Transcribe(context.Background(), []byte("a"), "audio/wav", "") + if err != nil { + t.Fatal(err) + } + if !strings.HasSuffix(capturedFilename, ".wav") { + t.Errorf("derived filename should use .wav for audio/wav, got %q", 
capturedFilename) + } +} + +func TestService_Transcribe_UpstreamError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"bad audio"}`)) + })) + defer srv.Close() + + svc := New("sk-test", WithEndpoint(srv.URL)) + _, err := svc.Transcribe(context.Background(), []byte("a"), "audio/webm", "") + var upstream *UpstreamError + if !errors.As(err, &upstream) { + t.Fatalf("expected UpstreamError, got %T: %v", err, err) + } + if upstream.StatusCode != http.StatusBadRequest { + t.Errorf("status = %d", upstream.StatusCode) + } + if !strings.Contains(upstream.Body, "bad audio") { + t.Errorf("body did not contain upstream payload: %q", upstream.Body) + } +} + +func TestExtensionForMime(t *testing.T) { + cases := map[string]string{ + "audio/webm": ".webm", + "audio/wav": ".wav", + "audio/x-wav": ".wav", + "audio/mp4": ".m4a", + "audio/m4a": ".m4a", + "audio/mpeg": ".mp3", + "audio/mp3": ".mp3", + "audio/ogg": ".ogg", + "": ".webm", + "application/anything": ".webm", + } + for mime, want := range cases { + if got := extensionForMime(mime); got != want { + t.Errorf("extensionForMime(%q) = %q, want %q", mime, got, want) + } + } +} + +func TestBuildMultipart_Roundtrip(t *testing.T) { + body, ct, err := buildMultipart([]byte("hello"), "audio/wav", "a.wav", "whisper-1") + if err != nil { + t.Fatal(err) + } + // Parse the multipart body back out using the boundary embedded in ct. 
+ mediaType, params, ok := splitContentType(ct) + if !ok || mediaType != "multipart/form-data" { + t.Fatalf("unexpected content-type: %q", ct) + } + mr := multipart.NewReader(body, params["boundary"]) + fields := map[string]string{} + var fileContent string + for { + part, err := mr.NextPart() + if err == io.EOF { + break + } + if err != nil { + t.Fatal(err) + } + buf, _ := io.ReadAll(part) + if part.FileName() != "" { + fileContent = string(buf) + } else { + fields[part.FormName()] = string(buf) + } + } + if fileContent != "hello" { + t.Errorf("file part = %q", fileContent) + } + if fields["model"] != "whisper-1" { + t.Errorf("model field = %q", fields["model"]) + } + if fields["response_format"] != "json" { + t.Errorf("response_format field = %q", fields["response_format"]) + } +} + +// splitContentType is a tiny helper to split "multipart/form-data; boundary=…" +// without pulling in mime.ParseMediaType — keeps this test file self-contained. +func splitContentType(ct string) (string, map[string]string, bool) { + parts := strings.SplitN(ct, ";", 2) + if len(parts) != 2 { + return "", nil, false + } + mediaType := strings.TrimSpace(parts[0]) + params := map[string]string{} + for _, kv := range strings.Split(parts[1], ";") { + kv = strings.TrimSpace(kv) + eq := strings.IndexByte(kv, '=') + if eq < 0 { + continue + } + params[kv[:eq]] = strings.Trim(kv[eq+1:], `"`) + } + return mediaType, params, true +} diff --git a/apps/pnpm-lock.yaml b/apps/pnpm-lock.yaml index e773c9190..8120436bf 100644 --- a/apps/pnpm-lock.yaml +++ b/apps/pnpm-lock.yaml @@ -246,6 +246,9 @@ importers: '@dnd-kit/utilities': specifier: ^3.2.2 version: 3.2.2(react@19.2.3) + '@huggingface/transformers': + specifier: ^4.2.0 + version: 4.2.0 '@kandev/theme': specifier: workspace:* version: link:../packages/theme @@ -419,7 +422,7 @@ importers: version: 0.55.1 next: specifier: 16.1.7 - version: 16.1.7(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3) + version: 
16.1.7(@babel/core@7.28.6)(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3) next-themes: specifier: ^0.4.6 version: 0.4.6(react-dom@19.2.3(react@19.2.3))(react@19.2.3) @@ -473,7 +476,7 @@ importers: version: 2.0.7(react-dom@19.2.3(react@19.2.3))(react@19.2.3) styled-jsx: specifier: 5.1.6 - version: 5.1.6(react@19.2.3) + version: 5.1.6(@babel/core@7.28.6)(react@19.2.3) tailwind-merge: specifier: ^3.4.0 version: 3.4.0 @@ -1406,6 +1409,16 @@ packages: peerDependencies: hono: ^4 + '@huggingface/jinja@0.5.9': + resolution: {integrity: sha512-uWTG+l3VJRsl7EXxYizuL3P+cCPoc3cRqbWWRcQN0FhejRfbdq0RNhCmbY/YDtnTcz9icdLYuLDjsnz4d8JMuw==} + engines: {node: '>=18'} + + '@huggingface/tokenizers@0.1.3': + resolution: {integrity: sha512-8rF/RRT10u+kn7YuUbUg0OF30K8rjTc78aHpxT+qJ1uWSqxT1MHi8+9ltwYfkFYJzT/oS+qw3JVfHtNMGAdqyA==} + + '@huggingface/transformers@4.2.0': + resolution: {integrity: sha512-8BRCoBMH0XsWaEIamuR0LrJGAfftgHAfb2Vrffy0VKlSAE/MnUJ5/h/zTfEP3fDIft+nk7TqB8xXEyABGitBjQ==} + '@humanfs/core@0.19.1': resolution: {integrity: sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==} engines: {node: '>=18.18.0'} @@ -1845,6 +1858,36 @@ packages: engines: {node: '>=18'} hasBin: true + '@protobufjs/aspromise@1.1.2': + resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} + + '@protobufjs/base64@1.1.2': + resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} + + '@protobufjs/codegen@2.0.5': + resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==} + + '@protobufjs/eventemitter@1.1.1': + resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==} + + '@protobufjs/fetch@1.1.1': + resolution: {integrity: 
sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==} + + '@protobufjs/float@1.0.2': + resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} + + '@protobufjs/inquire@1.1.2': + resolution: {integrity: sha512-pa0vFRuws4wkvaXKK1uXZMAwAX4/t8ANaJo45iw/oQHNQ9q5xUzwgFmVJGXiga2BeN+zpX7Vf9vmsiIa2J+MUw==} + + '@protobufjs/path@1.1.2': + resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} + + '@protobufjs/pool@1.1.0': + resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} + + '@protobufjs/utf8@1.1.1': + resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==} + '@radix-ui/number@1.1.1': resolution: {integrity: sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==} @@ -3423,6 +3466,7 @@ packages: '@ungap/structured-clone@1.3.0': resolution: {integrity: sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==} + deprecated: Potential CWE-502 - Update to 1.3.1 or higher '@unrs/resolver-binding-android-arm-eabi@1.11.1': resolution: {integrity: sha512-ppLRUgHVaGRWUx0R0Ut06Mjo9gBaBkg3v/8AxusGLhsIotbBLuRk51rAzqLC8gq6NyyAojEXglNjzf6R948DNw==} @@ -3575,6 +3619,10 @@ packages: engines: {node: '>=0.4.0'} hasBin: true + adm-zip@0.5.17: + resolution: {integrity: sha512-+Ut8d9LLqwEvHHJl1+PIHqoyDxFgVN847JTVM3Izi3xHDWPE4UtzzXysMZQs64DMcrJfBeS/uoEP4AD3HQHnQQ==} + engines: {node: '>=12.0'} + agent-base@7.1.4: resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} engines: {node: '>= 14'} @@ -3707,6 +3755,10 @@ packages: resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} 
engines: {node: '>=18'} + boolean@3.2.0: + resolution: {integrity: sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==} + deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info. + brace-expansion@1.1.12: resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==} @@ -4226,6 +4278,9 @@ packages: detect-node-es@1.1.0: resolution: {integrity: sha512-ypdmJU/TbBby2Dxibuv7ZLW3Bs1QEmM7nHjEANfohJLvE0XVujisn1qPJcZxg+qDucsr+bP6fLD1rPS3AhJ7EQ==} + detect-node@2.1.0: + resolution: {integrity: sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==} + devlop@1.1.0: resolution: {integrity: sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==} @@ -4367,6 +4422,9 @@ packages: resolution: {integrity: sha512-w+5mJ3GuFL+NjVtJlvydShqE1eN3h3PbI7/5LAsYJP/2qtuMXjfL2LpHSRqo4b4eSF5K/DH1JXKUAHSB2UW50g==} engines: {node: '>= 0.4'} + es6-error@4.1.1: + resolution: {integrity: sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg==} + esbuild@0.21.5: resolution: {integrity: sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==} engines: {node: '>=12'} @@ -4642,6 +4700,9 @@ packages: resolution: {integrity: sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==} engines: {node: '>=16'} + flatbuffers@25.9.23: + resolution: {integrity: sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ==} + flatted@3.3.3: resolution: {integrity: sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==} @@ -4765,6 +4826,10 @@ packages: resolution: {integrity: sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==} engines: {node: 
'>=10.13.0'} + global-agent@3.0.0: + resolution: {integrity: sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q==} + engines: {node: '>=10.0'} + global-directory@4.0.1: resolution: {integrity: sha512-wHTUcDUoZ1H5/0iVqEudYW4/kAlN5cZ3j/bXn0Dpbizl9iaUVeWSHqiOjsgk6OW2bkLclbBjzewBz6weQ1zA2Q==} engines: {node: '>=18'} @@ -4792,6 +4857,9 @@ packages: resolution: {integrity: sha512-DKKrynuQRne0PNpEbzuEdHlYOMksHSUI8Zc9Unei5gTsMNA2/vMpoMz/yKba50pejK56qj98qM0SjYxAKi13gQ==} engines: {node: ^12.22.0 || ^14.16.0 || ^16.0.0 || >=17.0.0} + guid-typescript@1.0.9: + resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} + hachure-fill@0.5.2: resolution: {integrity: sha512-3GKBOn+m2LX9iq+JC1064cSFprJY4jL1jCXTcpnfER5HYE2l/4EfWSGzkPa/ZDBmYI0ZOEj5VHV/eKnPGkHuOg==} @@ -5200,6 +5268,9 @@ packages: json-stable-stringify-without-jsonify@1.0.1: resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==} + json-stringify-safe@5.0.1: + resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==} + json5@1.0.2: resolution: {integrity: sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==} hasBin: true @@ -5385,6 +5456,9 @@ packages: resolution: {integrity: sha512-i24m8rpwhmPIS4zscNzK6MSEhk0DUWa/8iYQWxhffV8jkI4Phvs3F+quL5xvS0gdQR0FyTCMMH33Y78dDTzzIw==} engines: {node: '>=18'} + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + longest-streak@3.1.0: resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==} @@ -5431,6 +5505,10 @@ packages: engines: {node: '>= 20'} hasBin: true + matcher@3.0.0: + resolution: {integrity: 
sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng==} + engines: {node: '>=10'} + math-intrinsics@1.1.0: resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} engines: {node: '>= 0.4'} @@ -5798,6 +5876,19 @@ packages: oniguruma-to-es@4.3.4: resolution: {integrity: sha512-3VhUGN3w2eYxnTzHn+ikMI+fp/96KoRSVK9/kMTcFqj1NRDh2IhQCKvYxDnWePKRXY/AqH+Fuiyb7VHSzBjHfA==} + onnxruntime-common@1.24.0-dev.20251116-b39e144322: + resolution: {integrity: sha512-BOoomdHYmNRL5r4iQ4bMvsl2t0/hzVQ3OM3PHD0gxeXu1PmggqBv3puZicEUVOA3AtHHYmqZtjMj9FOfGrATTw==} + + onnxruntime-common@1.24.3: + resolution: {integrity: sha512-GeuPZO6U/LBJXvwdaqHbuUmoXiEdeCjWi/EG7Y1HNnDwJYuk6WUbNXpF6luSUY8yASul3cmUlLGrCCL1ZgVXqA==} + + onnxruntime-node@1.24.3: + resolution: {integrity: sha512-JH7+czbc8ALA819vlTgcV+Q214/+VjGeBHDjX81+ZCD0PCVCIFGFNtT0V4sXG/1JXypKPgScQcB3ij/hk3YnTg==} + os: [win32, darwin, linux] + + onnxruntime-web@1.26.0-dev.20260416-b7804b056c: + resolution: {integrity: sha512-MD6Ss4GSpQBo6zqoJzyT9LRbKYs7x/JVN23FT24EcEvlqF4VuzPOeH6X38orZPKHQDbprn7K+SBpu0/mj2CQiw==} + open@11.0.0: resolution: {integrity: sha512-smsWv2LzFjP03xmvFoJ331ss6h+jixfA4UUV/Bsiyuu4YJPfN+FIQGOIiv4w9/+MoHkfkJ22UIaQWRVFRfH6Vw==} engines: {node: '>=20'} @@ -5911,6 +6002,9 @@ packages: pkg-types@1.3.1: resolution: {integrity: sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ==} + platform@1.3.6: + resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} + playwright-core@1.58.2: resolution: {integrity: sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==} engines: {node: '>=18'} @@ -6036,6 +6130,10 @@ packages: prosemirror-view@1.41.5: resolution: {integrity: sha512-UDQbIPnDrjE8tqUBbPmCOZgtd75htE6W3r0JCmY9bL6W1iemDM37MZEKC49d+tdQ0v/CKx4gjxLoLsfkD2NiZA==} + protobufjs@7.6.1: 
+ resolution: {integrity: sha512-4K0myLaWL5EteuSAro91EGFgcfVgxb64Jx+7oDAY6GOkXD4M69yuSEljNcInGVCA5sOPxmZ/EqDLj2x0Q0+Ygg==} + engines: {node: '>=12.0.0'} + proxy-addr@2.0.7: resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} engines: {node: '>= 0.10'} @@ -6170,6 +6268,7 @@ packages: recharts@2.15.4: resolution: {integrity: sha512-UT/q6fwS3c1dHbXv2uFgYJ9BMFHu3fwnd7AYZaEQhXuYQ4hgsxLvsUXzGdKeZrW5xopzDCvuA2N41WJ88I7zIw==} engines: {node: '>=14'} + deprecated: 1.x and 2.x branches are no longer active. Bump to Recharts v3 to receive latest features and bugfixes. See https://github.com/recharts/recharts/wiki/3.0-migration-guide peerDependencies: react: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 react-dom: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 @@ -6265,6 +6364,10 @@ packages: resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==} engines: {iojs: '>=1.0.0', node: '>=0.10.0'} + roarr@2.15.4: + resolution: {integrity: sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A==} + engines: {node: '>=8.0'} + robust-predicates@3.0.2: resolution: {integrity: sha512-IXgzBWvWQwE6PrDI05OvmXUIruQTcoMDzRsOd5CDvHCVLcLHMTSYvOK5Cm46kWqlV3yAbuSpBZdJ5oP5OUoStg==} @@ -6315,6 +6418,9 @@ packages: resolution: {integrity: sha512-3A6sD0WYP7+QrjbfNA2FN3FsOaGGFoekCVgTyypy53gPxhbkCIjtO6YWgdrfM+n/8sI8JeXZOIxsHjMTNxQ4nQ==} engines: {node: ^14.0.0 || >=16.0.0} + semver-compare@1.0.0: + resolution: {integrity: sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==} + semver@6.3.1: resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==} hasBin: true @@ -6328,6 +6434,10 @@ packages: resolution: {integrity: sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==} engines: {node: '>= 18'} + 
serialize-error@7.0.1: + resolution: {integrity: sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw==} + engines: {node: '>=10'} + serve-static@2.2.1: resolution: {integrity: sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==} engines: {node: '>= 18'} @@ -6416,6 +6526,9 @@ packages: resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==} engines: {node: '>= 10.x'} + sprintf-js@1.1.3: + resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==} + stable-hash@0.0.5: resolution: {integrity: sha512-+L3ccpzibovGXFK+Ap/f8LOS0ahMrHTf3xu7mMLSpEGU0EO9ucaysSylKo9eRDFNhWve/y275iPmIZ4z39a9iA==} @@ -6664,6 +6777,10 @@ packages: resolution: {integrity: sha512-Acylog8/luQ8L7il+geoSxhEkazvkslg7PSNKOX59mbB9cOveP5aq9h74Y7YU8yDpJwetzQQrfIwtf4Wp4LKcw==} engines: {node: '>=4'} + type-fest@0.13.1: + resolution: {integrity: sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==} + engines: {node: '>=10'} + type-fest@5.4.0: resolution: {integrity: sha512-wfkA6r0tBpVfGiyO+zbf9e10QkRQSlK9F2UvyfnjoCmrvH2bjHyhPzhugSBOuq1dog3P0+FKckqe+Xf6WKVjwg==} engines: {node: '>=20'} @@ -7914,6 +8031,18 @@ snapshots: dependencies: hono: 4.11.3 + '@huggingface/jinja@0.5.9': {} + + '@huggingface/tokenizers@0.1.3': {} + + '@huggingface/transformers@4.2.0': + dependencies: + '@huggingface/jinja': 0.5.9 + '@huggingface/tokenizers': 0.1.3 + onnxruntime-node: 1.24.3 + onnxruntime-web: 1.26.0-dev.20260416-b7804b056c + sharp: 0.34.5 + '@humanfs/core@0.19.1': {} '@humanfs/node@0.16.7': @@ -7933,8 +8062,7 @@ snapshots: '@iconify/types': 2.0.0 mlly: 1.8.0 - '@img/colour@1.1.0': - optional: true + '@img/colour@1.1.0': {} '@img/sharp-darwin-arm64@0.34.5': optionalDependencies: @@ -8320,6 +8448,28 @@ snapshots: dependencies: playwright: 1.58.2 + 
'@protobufjs/aspromise@1.1.2': {} + + '@protobufjs/base64@1.1.2': {} + + '@protobufjs/codegen@2.0.5': {} + + '@protobufjs/eventemitter@1.1.1': {} + + '@protobufjs/fetch@1.1.1': + dependencies: + '@protobufjs/aspromise': 1.1.2 + + '@protobufjs/float@1.0.2': {} + + '@protobufjs/inquire@1.1.2': {} + + '@protobufjs/path@1.1.2': {} + + '@protobufjs/pool@1.1.0': {} + + '@protobufjs/utf8@1.1.1': {} + '@radix-ui/number@1.1.1': {} '@radix-ui/primitive@1.1.3': {} @@ -10077,6 +10227,8 @@ snapshots: acorn@8.15.0: {} + adm-zip@0.5.17: {} + agent-base@7.1.4: {} ajv-formats@3.0.1(ajv@8.17.1): @@ -10230,6 +10382,8 @@ snapshots: transitivePeerDependencies: - supports-color + boolean@3.2.0: {} + brace-expansion@1.1.12: dependencies: balanced-match: 1.0.2 @@ -10742,6 +10896,8 @@ snapshots: detect-node-es@1.1.0: {} + detect-node@2.1.0: {} + devlop@1.1.0: dependencies: dequal: 2.0.3 @@ -10943,6 +11099,8 @@ snapshots: is-date-object: 1.1.0 is-symbol: 1.1.1 + es6-error@4.1.1: {} + esbuild@0.21.5: optionalDependencies: '@esbuild/aix-ppc64': 0.21.5 @@ -11039,8 +11197,8 @@ snapshots: '@next/eslint-plugin-next': 16.1.1 eslint: 9.39.2(jiti@2.6.1) eslint-import-resolver-node: 0.3.9 - eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) - eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) + eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1)) + eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)) 
eslint-plugin-jsx-a11y: 6.10.2(eslint@9.39.2(jiti@2.6.1)) eslint-plugin-react: 7.37.5(eslint@9.39.2(jiti@2.6.1)) eslint-plugin-react-hooks: 7.0.1(eslint@9.39.2(jiti@2.6.1)) @@ -11062,7 +11220,7 @@ snapshots: transitivePeerDependencies: - supports-color - eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)): + eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1)): dependencies: '@nolyfill/is-core-module': 1.0.39 debug: 4.4.3 @@ -11073,22 +11231,22 @@ snapshots: tinyglobby: 0.2.15 unrs-resolver: 1.11.1 optionalDependencies: - eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) + eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)) transitivePeerDependencies: - supports-color - eslint-module-utils@2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)): + eslint-module-utils@2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)): dependencies: debug: 3.2.7 optionalDependencies: '@typescript-eslint/parser': 
8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3) eslint: 9.39.2(jiti@2.6.1) eslint-import-resolver-node: 0.3.9 - eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) + eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1)) transitivePeerDependencies: - supports-color - eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)): + eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)): dependencies: '@rtsao/scc': 1.1.0 array-includes: 3.1.9 @@ -11099,7 +11257,7 @@ snapshots: doctrine: 2.1.0 eslint: 9.39.2(jiti@2.6.1) eslint-import-resolver-node: 0.3.9 - eslint-module-utils: 2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) + eslint-module-utils: 2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)) hasown: 2.0.2 is-core-module: 2.16.1 is-glob: 4.0.3 @@ -11425,6 +11583,8 @@ snapshots: flatted: 3.3.3 keyv: 4.5.4 + flatbuffers@25.9.23: {} + flatted@3.3.3: {} for-each@0.3.5: @@ -11537,6 +11697,15 @@ snapshots: dependencies: is-glob: 4.0.3 + 
global-agent@3.0.0: + dependencies: + boolean: 3.2.0 + es6-error: 4.1.1 + matcher: 3.0.0 + roarr: 2.15.4 + semver: 7.7.4 + serialize-error: 7.0.1 + global-directory@4.0.1: dependencies: ini: 4.1.1 @@ -11556,6 +11725,8 @@ snapshots: graphql@16.12.0: {} + guid-typescript@1.0.9: {} + hachure-fill@0.5.2: {} happy-dom@20.8.9: @@ -11974,6 +12145,8 @@ snapshots: json-stable-stringify-without-jsonify@1.0.1: {} + json-stringify-safe@5.0.1: {} + json5@1.0.2: dependencies: minimist: 1.2.8 @@ -12127,6 +12300,8 @@ snapshots: chalk: 5.6.2 is-unicode-supported: 1.3.0 + long@5.3.2: {} + longest-streak@3.1.0: {} loose-envify@1.4.0: @@ -12172,6 +12347,10 @@ snapshots: marked@16.4.2: {} + matcher@3.0.0: + dependencies: + escape-string-regexp: 4.0.0 + math-intrinsics@1.1.0: {} mdast-util-find-and-replace@3.0.2: @@ -12655,7 +12834,7 @@ snapshots: react: 19.2.3 react-dom: 19.2.3(react@19.2.3) - next@16.1.7(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3): + next@16.1.7(@babel/core@7.28.6)(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3): dependencies: '@next/env': 16.1.7 '@swc/helpers': 0.5.15 @@ -12664,7 +12843,7 @@ snapshots: postcss: 8.4.31 react: 19.2.3 react-dom: 19.2.3(react@19.2.3) - styled-jsx: 5.1.6(react@19.2.3) + styled-jsx: 5.1.6(@babel/core@7.28.6)(react@19.2.3) optionalDependencies: '@next/swc-darwin-arm64': 16.1.7 '@next/swc-darwin-x64': 16.1.7 @@ -12775,6 +12954,25 @@ snapshots: regex: 6.1.0 regex-recursion: 6.0.2 + onnxruntime-common@1.24.0-dev.20251116-b39e144322: {} + + onnxruntime-common@1.24.3: {} + + onnxruntime-node@1.24.3: + dependencies: + adm-zip: 0.5.17 + global-agent: 3.0.0 + onnxruntime-common: 1.24.3 + + onnxruntime-web@1.26.0-dev.20260416-b7804b056c: + dependencies: + flatbuffers: 25.9.23 + guid-typescript: 1.0.9 + long: 5.3.2 + onnxruntime-common: 1.24.0-dev.20251116-b39e144322 + platform: 1.3.6 + protobufjs: 7.6.1 + open@11.0.0: dependencies: default-browser: 5.4.0 @@ -12894,6 +13092,8 @@ snapshots: mlly: 1.8.0 
pathe: 2.0.3 + platform@1.3.6: {} + playwright-core@1.58.2: {} playwright@1.58.2: @@ -13066,6 +13266,21 @@ snapshots: prosemirror-state: 1.4.4 prosemirror-transform: 1.11.0 + protobufjs@7.6.1: + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/base64': 1.1.2 + '@protobufjs/codegen': 2.0.5 + '@protobufjs/eventemitter': 1.1.1 + '@protobufjs/fetch': 1.1.1 + '@protobufjs/float': 1.0.2 + '@protobufjs/inquire': 1.1.2 + '@protobufjs/path': 1.1.2 + '@protobufjs/pool': 1.1.0 + '@protobufjs/utf8': 1.1.1 + '@types/node': 20.19.28 + long: 5.3.2 + proxy-addr@2.0.7: dependencies: forwarded: 0.2.0 @@ -13399,6 +13614,15 @@ snapshots: reusify@1.1.0: {} + roarr@2.15.4: + dependencies: + boolean: 3.2.0 + detect-node: 2.1.0 + globalthis: 1.0.4 + json-stringify-safe: 5.0.1 + semver-compare: 1.0.0 + sprintf-js: 1.1.3 + robust-predicates@3.0.2: {} rollup@4.55.1: @@ -13488,6 +13712,8 @@ snapshots: refa: 0.12.1 regexp-ast-analysis: 0.7.1 + semver-compare@1.0.0: {} + semver@6.3.1: {} semver@7.7.4: {} @@ -13508,6 +13734,10 @@ snapshots: transitivePeerDependencies: - supports-color + serialize-error@7.0.1: + dependencies: + type-fest: 0.13.1 + serve-static@2.2.1: dependencies: encodeurl: 2.0.0 @@ -13615,7 +13845,6 @@ snapshots: '@img/sharp-win32-arm64': 0.34.5 '@img/sharp-win32-ia32': 0.34.5 '@img/sharp-win32-x64': 0.34.5 - optional: true shebang-command@2.0.0: dependencies: @@ -13683,6 +13912,8 @@ snapshots: split2@4.2.0: {} + sprintf-js@1.1.3: {} + stable-hash@0.0.5: {} stackback@0.0.2: {} @@ -13807,10 +14038,12 @@ snapshots: dependencies: inline-style-parser: 0.2.7 - styled-jsx@5.1.6(react@19.2.3): + styled-jsx@5.1.6(@babel/core@7.28.6)(react@19.2.3): dependencies: client-only: 0.0.1 react: 19.2.3 + optionalDependencies: + '@babel/core': 7.28.6 stylis@4.3.6: {} @@ -13926,6 +14159,8 @@ snapshots: type-detect@4.1.0: {} + type-fest@0.13.1: {} + type-fest@5.4.0: dependencies: tagged-tag: 1.0.0 diff --git a/apps/web/app/settings/voice-mode/page.tsx 
b/apps/web/app/settings/voice-mode/page.tsx new file mode 100644 index 000000000..2bcc1b851 --- /dev/null +++ b/apps/web/app/settings/voice-mode/page.tsx @@ -0,0 +1,21 @@ +import { VoiceModeSettings } from "@/components/settings/voice-mode-settings"; +import { StateProvider } from "@/components/state-provider"; +import { fetchUserSettings } from "@/lib/api"; +import { mapUserSettingsResponse } from "@/lib/ssr/user-settings"; + +export default async function VoiceModeSettingsPage() { + let initialState = {}; + try { + const response = await fetchUserSettings({ cache: "no-store" }); + const mapped = mapUserSettingsResponse(response); + initialState = { userSettings: mapped.loaded ? mapped : undefined }; + } catch { + initialState = {}; + } + + return ( + + + + ); +} diff --git a/apps/web/components/settings/editors-settings-state.tsx b/apps/web/components/settings/editors-settings-state.tsx index 2d2e76e3b..ef891eeb5 100644 --- a/apps/web/components/settings/editors-settings-state.tsx +++ b/apps/web/components/settings/editors-settings-state.tsx @@ -7,7 +7,7 @@ import { createEditor, deleteEditor, updateEditor, updateUserSettings } from "@/ import { useRequest } from "@/lib/http/use-request"; import type { EditorOption } from "@/lib/types/http"; import { type ComboboxOption } from "@/components/combobox"; -import { parseTerminalLinkBehavior } from "@/lib/ssr/user-settings"; +import { parseTerminalLinkBehavior, parseVoiceMode } from "@/lib/ssr/user-settings"; import { fromApiSidebarView } from "@/lib/state/slices/ui/sidebar-view-wire"; import { type EditorFormState, @@ -245,6 +245,7 @@ function buildUserSettingsFromResponse( terminalFontFamily: s.terminal_font_family || null, terminalFontSize: s.terminal_font_size || null, changesPanelLayout: s.changes_panel_layout === "tree" ? 
("tree" as const) : ("flat" as const), + voiceMode: parseVoiceMode(s.voice_mode), ...mapEditorSettingsFields(s), }; } diff --git a/apps/web/components/settings/keyboard-shortcuts-card.tsx b/apps/web/components/settings/keyboard-shortcuts-card.tsx index 7b1c363be..adb484e27 100644 --- a/apps/web/components/settings/keyboard-shortcuts-card.tsx +++ b/apps/web/components/settings/keyboard-shortcuts-card.tsx @@ -17,7 +17,7 @@ import { useAppStore } from "@/components/state-provider"; import { useToast } from "@/components/toast-provider"; import { updateUserSettings } from "@/lib/api/domains/settings-api"; -function ShortcutRecorder({ +export function ShortcutRecorder({ shortcutId, current, onChange, diff --git a/apps/web/components/settings/settings-app-sidebar.tsx b/apps/web/components/settings/settings-app-sidebar.tsx index 20993c191..1ea984397 100644 --- a/apps/web/components/settings/settings-app-sidebar.tsx +++ b/apps/web/components/settings/settings-app-sidebar.tsx @@ -11,6 +11,7 @@ import { IconCode, IconCpu, IconKey, + IconMicrophone, IconMessageCircle, IconBrandGithub, IconBrandGitlab, @@ -315,6 +316,47 @@ function ExecutorsSidebarSection({ pathname, executors }: ExecutorsSidebarSectio ); } +type SimpleSidebarEntry = { + href: string; + label: string; + Icon: typeof IconBrandGithub; +}; + +/** + * A short row of single-link sidebar entries (Automations, Prompts, Voice + * Mode, Utility Agents, External MCP) — extracted from `SettingsAppSidebar` + * so the parent function stays under the 100-line lint limit. 
+ */ +function SimpleSidebarRows({ + pathname, + entries, +}: { + pathname: string; + entries: SimpleSidebarEntry[]; +}) { + return ( + <> + {entries.map(({ href, label, Icon }) => ( + + + + + {label} + + + + ))} + + ); +} + function SecretsSidebarSection({ pathname }: { pathname: string }) { return ( @@ -369,57 +411,33 @@ export function SettingsAppSidebar() { - {/* Automations */} - - - - - Automations - - - - + - - {/* Prompts */} - - - - - Prompts - - - - - {/* Utility Agents */} - - - - - Utility Agents - - - - + - - - {/* External MCP */} - - - - - External MCP - - - + {/* System */} diff --git a/apps/web/components/settings/voice-mode-settings.tsx b/apps/web/components/settings/voice-mode-settings.tsx new file mode 100644 index 000000000..610c85fc1 --- /dev/null +++ b/apps/web/components/settings/voice-mode-settings.tsx @@ -0,0 +1,482 @@ +"use client"; + +import { useCallback, useMemo, useState } from "react"; +import { IconAlertTriangle, IconMicrophone } from "@tabler/icons-react"; +import { Badge } from "@kandev/ui/badge"; +import { Card, CardContent, CardHeader, CardTitle } from "@kandev/ui/card"; +import { Label } from "@kandev/ui/label"; +import { RadioGroup, RadioGroupItem } from "@kandev/ui/radio-group"; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectLabel, + SelectTrigger, + SelectValue, +} from "@kandev/ui/select"; +import { Switch } from "@kandev/ui/switch"; +import { useAppStore } from "@/components/state-provider"; +import { useToast } from "@/components/toast-provider"; +import { updateUserSettings } from "@/lib/api"; +import { SettingsSection } from "@/components/settings/settings-section"; +import { ShortcutRecorder } from "@/components/settings/keyboard-shortcuts-card"; +import { detectVoiceCapabilities, type VoiceCapabilities } from "@/lib/voice/capabilities"; +import type { VoiceModeState } from "@/lib/state/slices/settings/types"; +import type { KeyboardShortcut } from "@/lib/keyboard/constants"; +import { + 
CONFIGURABLE_SHORTCUTS, + getShortcut, + type StoredShortcutOverrides, +} from "@/lib/keyboard/shortcut-overrides"; +import type { + VoiceInputActivationMode, + VoiceInputEngine, + VoiceModeSettings as VoiceModeWire, + WhisperWebModelSize, +} from "@/lib/types/http-voice"; + +// Single source of truth for the language options. Web Speech reads `lang`, +// Whisper engines treat it as a hint. "auto" defers to the browser locale. +const LANGUAGE_OPTIONS: Array<{ value: string; label: string }> = [ + { value: "auto", label: "Auto-detect (browser language)" }, + { value: "en-US", label: "English (United States)" }, + { value: "en-GB", label: "English (United Kingdom)" }, + { value: "es-ES", label: "Spanish (Spain)" }, + { value: "es-MX", label: "Spanish (Mexico)" }, + { value: "pt-PT", label: "Portuguese (Portugal)" }, + { value: "pt-BR", label: "Portuguese (Brazil)" }, + { value: "fr-FR", label: "French" }, + { value: "de-DE", label: "German" }, + { value: "it-IT", label: "Italian" }, + { value: "ja-JP", label: "Japanese" }, + { value: "zh-CN", label: "Chinese (Simplified)" }, +]; + +const WHISPER_MODELS: Array<{ + value: WhisperWebModelSize; + label: string; + size: string; + hint: string; +}> = [ + { value: "tiny", label: "Tiny", size: "~40 MB", hint: "Fastest, lower accuracy" }, + { value: "base", label: "Base", size: "~75 MB", hint: "Balanced default" }, + { value: "small", label: "Small", size: "~240 MB", hint: "Best accuracy, slower load" }, +]; + +function toWire(state: VoiceModeState): VoiceModeWire { + return { + enabled: state.enabled, + engine: state.engine, + language: state.language, + mode: state.mode, + auto_send: state.autoSend, + whisper_web_model: state.whisperWebModel, + }; +} + +// ── Save hook ──────────────────────────────────────────────────────────── + +function useVoiceModeSaver() { + const userSettings = useAppStore((s) => s.userSettings); + const setUserSettings = useAppStore((s) => s.setUserSettings); + const { toast } = useToast(); + const 
[saving, setSaving] = useState(false); + + const save = useCallback( + async (patch: Partial) => { + const previous = userSettings.voiceMode; + const next = { ...previous, ...patch }; + setUserSettings({ ...userSettings, voiceMode: next }); + setSaving(true); + try { + await updateUserSettings({ voice_mode: toWire(next) }); + } catch { + setUserSettings({ ...userSettings, voiceMode: previous }); + toast({ title: "Failed to save Voice Mode setting", variant: "error" }); + } finally { + setSaving(false); + } + }, + [userSettings, setUserSettings, toast], + ); + + return { save, saving }; +} + +// ── Engine card ────────────────────────────────────────────────────────── + +type EngineOption = { + value: VoiceInputEngine; + label: string; + description: string; + badge?: string; + disabled?: boolean; +}; + +function buildEngineOptions(caps: VoiceCapabilities): EngineOption[] { + return [ + { + value: "auto", + label: "Automatic", + description: "Use the best engine available in this browser.", + }, + { + value: "webSpeech", + label: "Web Speech (in-browser)", + description: caps.webSpeech + ? "Free, instant, uses your browser's built-in speech recognition." + : "Not supported in this browser.", + disabled: !caps.webSpeech, + }, + { + value: "whisperWeb", + label: "Whisper Web (private, in-browser)", + description: caps.whisperWeb + ? "Runs OpenAI Whisper entirely on this device. First use downloads the model (40–240 MB)." + : "Not supported in this browser.", + badge: "Local", + disabled: !caps.whisperWeb, + }, + { + value: "whisperServer", + label: "Whisper Server (OpenAI)", + description: caps.audioCapture + ? "Sends audio to the backend, which forwards it to OpenAI's Whisper API. Requires a configured API key on the server." 
+ : "Not supported in this browser.", + badge: "Server", + disabled: !caps.audioCapture, + }, + ]; +} + +function EngineCard({ caps }: { caps: VoiceCapabilities }) { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + const options = useMemo(() => buildEngineOptions(caps), [caps]); + + return ( + + + Transcription Engine + + + save({ engine: v as VoiceInputEngine })} + disabled={saving} + className="space-y-3" + > + {options.map((opt) => ( + + ))} + + + + ); +} + +// ── Behavior card (language + mode + auto-send) ────────────────────────── + +function LanguageRow() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + return ( +
+ + +

+ Recognition quality drops sharply when the language doesn't match what you're + speaking. +

+
+ ); +} + +function ModeRow() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + return ( +
+ + save({ mode: v as VoiceInputActivationMode })} + disabled={saving} + className="flex gap-4" + > + + + +
+ ); +} + +function AutoSendRow() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + return ( +
+
+ +

+ Submit the message as soon as the transcript is inserted. +

+
+ save({ autoSend: checked })} + disabled={saving} + /> +
+ ); +} + +function BehaviorCard() { + return ( + + + Behavior + + + + + + + + ); +} + +// ── Whisper Web model card ─────────────────────────────────────────────── + +function WhisperModelCard() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + + return ( + + + Whisper Web Model + + + save({ whisperWebModel: v as WhisperWebModelSize })} + disabled={saving} + className="space-y-2" + > + {WHISPER_MODELS.map((m) => ( + + ))} + +

+ The model downloads on first use and is cached in your browser. Switching models triggers + another download next time you record. +

+
+
+ ); +} + +// ── Enable card (top-level on/off) ─────────────────────────────────────── + +function EnableCard() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + return ( + + + Enable Voice Input + + +
+
+ +

+ When off, the voice button is hidden entirely and no voice-related code runs. Settings + below are preserved and re-applied when you turn it back on. +

+
+ save({ enabled: checked })} + disabled={saving} + /> +
+
+
+ ); +} + +// ── Availability banner ────────────────────────────────────────────────── + +function AvailabilityBanner({ caps }: { caps: VoiceCapabilities }) { + if (caps.webSpeech || caps.whisperWeb || caps.audioCapture) return null; + // Secure-context requirement is the most common reason capability detection + // returns all-false on mobile (when reaching the dev server over LAN HTTP). + // Spell it out so the user doesn't have to guess. + const insecure = typeof window !== "undefined" && !window.isSecureContext; + return ( +
+ +
+

Voice input is unavailable in this browser.

+

+ {insecure + ? "Microphone APIs require HTTPS or localhost. You appear to be on an insecure HTTP origin — load this page over HTTPS (or http://localhost) to enable voice input." + : "Your browser doesn't expose either the Web Speech API or MediaRecorder. Try Chrome, Edge, or Safari 14.5+."} +

+
+
+ ); +} + +// ── Voice keyboard shortcut card ───────────────────────────────────────── + +function useShortcutSaver() { + const userSettings = useAppStore((s) => s.userSettings); + const setUserSettings = useAppStore((s) => s.setUserSettings); + const { toast } = useToast(); + return useCallback( + (next: StoredShortcutOverrides) => { + const previous = userSettings.keyboardShortcuts; + setUserSettings({ ...userSettings, keyboardShortcuts: next }); + updateUserSettings({ keyboard_shortcuts: next }).catch(() => { + setUserSettings({ ...userSettings, keyboardShortcuts: previous }); + toast({ title: "Failed to save shortcut", variant: "error" }); + }); + }, + [userSettings, setUserSettings, toast], + ); +} + +function VoiceShortcutCard() { + const overrides = useAppStore((s) => s.userSettings.keyboardShortcuts); + const persist = useShortcutSaver(); + const current = getShortcut("VOICE_INPUT_TOGGLE", overrides); + + const handleChange = useCallback( + (_id: string, shortcut: KeyboardShortcut) => + persist({ ...overrides, VOICE_INPUT_TOGGLE: shortcut }), + [overrides, persist], + ); + const handleReset = useCallback(() => { + const next = { ...overrides }; + delete next.VOICE_INPUT_TOGGLE; + persist(next); + }, [overrides, persist]); + + return ( + + + + {CONFIGURABLE_SHORTCUTS.VOICE_INPUT_TOGGLE.label} Shortcut + + + + +

+ Click the shortcut to record a new key combination. All keyboard shortcuts can also be + edited in General Settings. +

+
+
+ ); +} + +// ── Page ───────────────────────────────────────────────────────────────── + +export function VoiceModeSettings() { + const caps = useMemo(() => detectVoiceCapabilities(), []); + const enabled = useAppStore((s) => s.userSettings.voiceMode.enabled); + return ( + } + title="Voice Mode" + description="Configure how voice input works on the chat composer." + > +
+ + {/* When voice is disabled, keep showing the secondary cards but dim + them — preserves the visible configuration without implying it has + any effect right now. */} +
+
+ + + + + +
+
+
+
+ ); +} diff --git a/apps/web/components/task/chat/chat-input-body.tsx b/apps/web/components/task/chat/chat-input-body.tsx index aad579f9d..1141c038a 100644 --- a/apps/web/components/task/chat/chat-input-body.tsx +++ b/apps/web/components/task/chat/chat-input-body.tsx @@ -52,6 +52,10 @@ export type ChatInputEditorAreaProps = { onEnhancePrompt?: () => void; isEnhancingPrompt?: boolean; isUtilityConfigured?: boolean; + /** Inserts a voice transcript into the editor at the current cursor. */ + onVoiceTranscript?: (text: string) => void; + /** Submit the message after a voice transcript is inserted (when auto-send is on). */ + onVoiceAutoSend?: () => void; }; function EditorWithTooltip({ @@ -123,6 +127,7 @@ export function ChatInputEditorArea(p: ChatInputEditorAreaProps) { const { isSending, onCancel, contextCount, contextPopoverOpen, setContextPopoverOpen } = p; const { contextFiles, onImplementPlan, onEnhancePrompt, isEnhancingPrompt } = p; const { isUtilityConfigured, hideSessionsDropdown, minimalToolbar, hidePlanMode } = p; + const { onVoiceTranscript, onVoiceAutoSend } = p; // Exclude auto-added plan context from the count — it's always present in plan mode // and shouldn't by itself enable the send button. const userContextCount = planContextEnabled ? 
Math.max(0, contextCount - 1) : contextCount; @@ -186,6 +191,8 @@ export function ChatInputEditorArea(p: ChatInputEditorAreaProps) { isEnhancingPrompt={isEnhancingPrompt} isUtilityConfigured={isUtilityConfigured} onAttachFiles={handleAttachFiles} + onVoiceTranscript={onVoiceTranscript} + onVoiceAutoSend={onVoiceAutoSend} hideSessionsDropdown={hideSessionsDropdown} minimalToolbar={minimalToolbar} hidePlanMode={hidePlanMode} diff --git a/apps/web/components/task/chat/chat-input-container.tsx b/apps/web/components/task/chat/chat-input-container.tsx index 5975e4550..22ffd9cd9 100644 --- a/apps/web/components/task/chat/chat-input-container.tsx +++ b/apps/web/components/task/chat/chat-input-container.tsx @@ -250,6 +250,8 @@ type EnhancePromptExtras = { onEnhancePrompt?: () => void; isEnhancingPrompt?: boolean; isUtilityConfigured?: boolean; + onVoiceTranscript?: (text: string) => void; + onVoiceAutoSend?: () => void; }; function buildEditorAreaProps( @@ -295,6 +297,8 @@ function buildEditorAreaProps( onEnhancePrompt: extras.onEnhancePrompt, isEnhancingPrompt: extras.isEnhancingPrompt, isUtilityConfigured: extras.isUtilityConfigured, + onVoiceTranscript: extras.onVoiceTranscript, + onVoiceAutoSend: extras.onVoiceAutoSend, hideSessionsDropdown: p.hideSessionsDropdown, minimalToolbar: p.minimalToolbar, hidePlanMode: p.hidePlanMode, @@ -359,6 +363,34 @@ export const ChatInputContainer = forwardRef { + const editor = s.inputRef.current; + if (!editor) return; + const trimmed = text.trim(); + if (!trimmed) return; + const cursor = editor.getSelectionStart(); + const current = editor.getValue(); + // Prepend a space when inserting after existing non-whitespace content + // so transcripts flow naturally without running into the previous word. + const charBefore = cursor > 0 ? current.charAt(cursor - 1) : ""; + const needsLeadingSpace = charBefore !== "" && !/\s/.test(charBefore); + const insert = needsLeadingSpace ? 
` ${trimmed}` : trimmed; + editor.insertText(insert, cursor, cursor); + }, + [s.inputRef], + ); + + // Auto-send fires the same submit path as the regular send button. Guards + // against firing while the input is in a disabled state (e.g. the agent + // is currently booting) — the button is hidden in that case anyway, but + // defence-in-depth so a stale keyboard shortcut press doesn't trigger. + const { submitDisabled: voiceSubmitDisabled, handleSubmitWithReset: voiceSubmit } = s; + const handleVoiceAutoSend = useCallback(() => { + if (voiceSubmitDisabled) return; + voiceSubmit(); + }, [voiceSubmitDisabled, voiceSubmit]); + if (p.isFailed || executorUnavailable) { return ( ); diff --git a/apps/web/components/task/chat/chat-input-toolbar.tsx b/apps/web/components/task/chat/chat-input-toolbar.tsx index 2447211d5..d8cb9ca10 100644 --- a/apps/web/components/task/chat/chat-input-toolbar.tsx +++ b/apps/web/components/task/chat/chat-input-toolbar.tsx @@ -31,6 +31,7 @@ import { ModeSelector } from "@/components/task/mode-selector"; import { ContextPopover } from "./context-popover"; import { ResetContextButton } from "./reset-context-button"; import { ImplementPlanButton } from "./implement-plan-button"; +import { VoiceInputButton } from "./voice-input-button"; import type { ContextFile } from "@/lib/state/context-files-store"; export type ChatInputToolbarProps = { @@ -67,6 +68,12 @@ export type ChatInputToolbarProps = { isUtilityConfigured?: boolean; /** Callback to open file picker for attaching files */ onAttachFiles?: () => void; + /** Callback to insert a transcribed voice utterance into the editor. When + * omitted, the voice button is hidden — keeps quick-chat / read-only + * variants free of a button they can't wire. */ + onVoiceTranscript?: (text: string) => void; + /** Optional auto-send hook fired after a voice transcript is inserted. 
*/ + onVoiceAutoSend?: () => void; /** Hide the sessions dropdown (for quick chat) */ hideSessionsDropdown?: boolean; /** When true, only render the submit/cancel button — no other controls */ @@ -308,6 +315,8 @@ function ToolbarRightSection({ onCancel, onSubmit, submitShortcut, + onVoiceTranscript, + onVoiceAutoSend, }: { showCollapsed: boolean; rightItems: ToolbarItemConfig[]; @@ -322,6 +331,8 @@ function ToolbarRightSection({ onCancel: () => void; onSubmit: () => void; submitShortcut: (typeof SHORTCUTS)[keyof typeof SHORTCUTS]; + onVoiceTranscript?: (text: string) => void; + onVoiceAutoSend?: () => void; }) { return (
@@ -330,7 +341,14 @@ function ToolbarRightSection({ {planModeEnabled && !isAgentBusy && onImplementPlan && ( )} -
+
+ {onVoiceTranscript && ( + + )}
); diff --git a/apps/web/components/task/chat/voice-input-button.tsx b/apps/web/components/task/chat/voice-input-button.tsx new file mode 100644 index 000000000..978bfebda --- /dev/null +++ b/apps/web/components/task/chat/voice-input-button.tsx @@ -0,0 +1,265 @@ +"use client"; + +import { useCallback, useEffect, useRef } from "react"; +import { IconLoader2, IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react"; + +import { Button } from "@kandev/ui/button"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@kandev/ui/tooltip"; +import { cn } from "@/lib/utils"; +import { + useVoiceInput, + type VoiceError, + type VoiceInputState, + type VoiceModelLoadState, +} from "@/hooks/use-voice-input"; +import { useAppStore } from "@/components/state-provider"; +import { useKeyboardShortcut } from "@/hooks/use-keyboard-shortcut"; +import { useToast } from "@/components/toast-provider"; +import { getShortcut } from "@/lib/keyboard/shortcut-overrides"; + +type VoiceInputButtonProps = { + /** Inserts the recognized transcript at the current cursor position. */ + onTranscript: (text: string) => void; + /** Called after a non-empty transcript was inserted, when auto-send is enabled. */ + onAutoSend?: () => void; + /** Disable while the chat input itself is disabled (sending / starting / failed). 
*/ + disabled?: boolean; +}; + +const TOOLTIP_BY_STATE: Record = { + idle: "Voice input", + requesting: "Requesting microphone…", + recording: "Stop recording", + processing: "Transcribing…", +}; + +const ARIA_BY_STATE: Record = { + idle: "Start voice input", + requesting: "Requesting microphone permission", + recording: "Stop voice input", + processing: "Transcribing voice input", +}; + +function ButtonIcon({ + state, + modelLoad, +}: { + state: VoiceInputState; + modelLoad: VoiceModelLoadState; +}) { + if (state === "processing" || state === "requesting" || modelLoad.state === "loading") { + return ; + } + if (state === "recording") { + return ; + } + return ; +} + +function toastForError(toast: ReturnType["toast"], err: VoiceError) { + if (err.code === "no-speech") { + toast({ title: err.message }); + return; + } + toast({ title: err.message, variant: "error" }); +} + +// ── Activation handlers ────────────────────────────────────────────────── + +function buildHoldHandlers(start: () => Promise, stop: () => Promise) { + return { + onPointerDown: (e: React.PointerEvent) => { + e.preventDefault(); + void start(); + }, + onPointerUp: (e: React.PointerEvent) => { + e.preventDefault(); + void stop(); + }, + onPointerLeave: () => void stop(), + onPointerCancel: () => void stop(), + }; +} + +function buildToggleHandler( + state: VoiceInputState, + start: () => Promise, + stop: () => Promise, +) { + return () => { + if (state === "idle") void start(); + else if (state === "recording") void stop(); + }; +} + +// ── Hook composition ───────────────────────────────────────────────────── + +function useAutoSendOnTranscript( + baseOnTranscript: (text: string) => void, + onAutoSend: (() => void) | undefined, + enabled: boolean, +) { + // Wrap onTranscript so we can defer auto-send until after the transcript + // has been inserted. 
requestAnimationFrame keeps a clean separation between + // the editor update and the submit handler, so the editor's onChange has + // already flushed when submit reads from it. + return useCallback( + (text: string) => { + baseOnTranscript(text); + if (enabled && onAutoSend) requestAnimationFrame(onAutoSend); + }, + [baseOnTranscript, onAutoSend, enabled], + ); +} + +function useVoiceShortcut( + enabled: boolean, + state: VoiceInputState, + start: () => Promise, + stop: () => Promise, +) { + const overrides = useAppStore((s) => s.userSettings.keyboardShortcuts); + const shortcut = getShortcut("VOICE_INPUT_TOGGLE", overrides); + const stateRef = useRef(state); + useEffect(() => { + stateRef.current = state; + }, [state]); + const handler = useCallback(() => { + if (stateRef.current === "idle") void start(); + else if (stateRef.current === "recording") void stop(); + }, [start, stop]); + useKeyboardShortcut(shortcut, handler, { enabled }); +} + +// ── Unsupported fallback ──────────────────────────────────────────────── + +function buildUnsupportedReason(): string { + if (typeof window === "undefined") return "Voice input is unavailable here."; + if (!window.isSecureContext) { + return "Voice input needs HTTPS. Open this site over https:// (or http://localhost) — most mobile browsers block microphone APIs on insecure origins."; + } + return "Voice input isn't supported in this browser. 
Try Chrome, Edge, or Safari 14.5+."; +} + +function UnsupportedVoiceButton({ disabled }: { disabled?: boolean }) { + const { toast } = useToast(); + const handleClick = () => { + toast({ + title: "Voice input unavailable", + description: buildUnsupportedReason(), + variant: "error", + }); + }; + return ( + + + + + Voice input unavailable — tap for details + + ); +} + +// ── Component ──────────────────────────────────────────────────────────── + +export function VoiceInputButton({ onTranscript, onAutoSend, disabled }: VoiceInputButtonProps) { + const enabled = useAppStore((s) => s.userSettings.voiceMode.enabled); + // Render nothing — including no hook subscriptions — when the user has + // disabled the feature in settings. Distinct from `!supported` (browser + // limitation) which shows a tappable greyed icon. Done as a sub-component + // so the unconditional hook count stays the same in the active path. + if (!enabled) return null; + return ( + + ); +} + +function EnabledVoiceInputButton({ onTranscript, onAutoSend, disabled }: VoiceInputButtonProps) { + const { toast } = useToast(); + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const handleError = useCallback((err: VoiceError) => toastForError(toast, err), [toast]); + const wrappedTranscript = useAutoSendOnTranscript(onTranscript, onAutoSend, voiceMode.autoSend); + + const { supported, state, modelLoad, start, stop, cancel } = useVoiceInput({ + onTranscript: wrappedTranscript, + onError: handleError, + }); + + // If the chat input gets disabled mid-recording, cancel rather than leave + // the mic indicator on. Hold-mode pointerup may not fire if focus moves. 
+ useEffect(() => { + if (disabled && (state === "recording" || state === "requesting")) cancel(); + }, [disabled, state, cancel]); + + useVoiceShortcut(supported && !disabled, state, start, stop); + + // Always render the button — even when unsupported — so users can see it on + // mobile and tap to learn why voice input isn't working (usually a missing + // secure context, e.g. when reaching the dev server over LAN HTTP). Hiding + // the button silently left mobile users with no discoverable feedback. + if (!supported) return ; + + const isRecording = state === "recording"; + const isBusy = state === "requesting" || state === "processing" || modelLoad.state === "loading"; + const holdMode = voiceMode.mode === "hold"; + + const pointerHandlers = holdMode ? buildHoldHandlers(start, stop) : {}; + const onClick = holdMode ? undefined : buildToggleHandler(state, start, stop); + + // Styled to mirror SubmitButton (h-7 w-7 rounded-full primary fill) so the + // two prominent input actions read as a pair on the right of the toolbar. + // Recording flips to a destructive fill with a pulsing ring so the active + // state is unmistakable even on mobile. + return ( + + + + + + {modelLoad.state === "loading" + ? `Loading model… ${Math.round(modelLoad.progress * 100)}%` + : `${TOOLTIP_BY_STATE[state]}${holdMode && state === "idle" ? 
" (hold)" : ""}`} + + + ); +} diff --git a/apps/web/hooks/use-user-display-settings.ts b/apps/web/hooks/use-user-display-settings.ts index 250e2bac2..c06dfb5c2 100644 --- a/apps/web/hooks/use-user-display-settings.ts +++ b/apps/web/hooks/use-user-display-settings.ts @@ -6,7 +6,10 @@ import { useAppStore } from "@/components/state-provider"; import { useRepositories } from "@/hooks/domains/workspace/use-repositories"; import { mapUserSettingsResponse } from "@/lib/ssr/user-settings"; import { repositoryId, type Repository } from "@/lib/types/http"; -import type { UserSettingsState } from "@/lib/state/slices/settings/types"; +import { + DEFAULT_VOICE_MODE_STATE, + type UserSettingsState, +} from "@/lib/state/slices/settings/types"; type DisplaySettings = UserSettingsState; @@ -36,7 +39,15 @@ function carryForwardTerminalSettings(current: DisplaySettings) { }; } -function carryForwardSettings(current: DisplaySettings) { +function carryForwardLspSettings(current: DisplaySettings) { + return { + lspAutoStartLanguages: current.lspAutoStartLanguages ?? [], + lspAutoInstallLanguages: current.lspAutoInstallLanguages ?? [], + lspServerConfigs: current.lspServerConfigs ?? {}, + }; +} + +function carryForwardCoreSettings(current: DisplaySettings) { return { shellOptions: current.shellOptions ?? [], defaultEditorId: current.defaultEditorId ?? null, @@ -44,14 +55,19 @@ function carryForwardSettings(current: DisplaySettings) { reviewAutoMarkOnScroll: current.reviewAutoMarkOnScroll ?? true, showReleaseNotification: current.showReleaseNotification ?? true, releaseNotesLastSeenVersion: current.releaseNotesLastSeenVersion ?? null, - lspAutoStartLanguages: current.lspAutoStartLanguages ?? [], - lspAutoInstallLanguages: current.lspAutoInstallLanguages ?? [], - lspServerConfigs: current.lspServerConfigs ?? {}, savedLayouts: current.savedLayouts ?? [], sidebarViews: current.sidebarViews ?? [], defaultUtilityAgentId: current.defaultUtilityAgentId ?? 
null, keyboardShortcuts: current.keyboardShortcuts ?? {}, changesPanelLayout: current.changesPanelLayout ?? "flat", + voiceMode: current.voiceMode ?? { ...DEFAULT_VOICE_MODE_STATE }, + }; +} + +function carryForwardSettings(current: DisplaySettings) { + return { + ...carryForwardCoreSettings(current), + ...carryForwardLspSettings(current), ...carryForwardTerminalSettings(current), }; } diff --git a/apps/web/hooks/use-voice-input.test.ts b/apps/web/hooks/use-voice-input.test.ts new file mode 100644 index 000000000..3137cceba --- /dev/null +++ b/apps/web/hooks/use-voice-input.test.ts @@ -0,0 +1,199 @@ +import { act, renderHook, waitFor } from "@testing-library/react"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +// ── Hoisted mocks (defined before the modules they replace are evaluated) ── + +const voicePrefs = vi.hoisted(() => ({ + value: { + engine: "auto" as "auto" | "webSpeech" | "whisperWeb" | "whisperServer", + language: "auto", + mode: "toggle" as "toggle" | "hold", + autoSend: false, + whisperWebModel: "base" as "tiny" | "base" | "small", + }, +})); + +vi.mock("@/components/state-provider", () => ({ + useAppStore: ( + selector: (state: { userSettings: { voiceMode: typeof voicePrefs.value } }) => unknown, + ) => selector({ userSettings: { voiceMode: voicePrefs.value } }), +})); + +const transcribeAudio = vi.hoisted(() => vi.fn()); +vi.mock("@/lib/api/domains/voice-api", () => ({ transcribeAudio })); + +// ── Mock SpeechRecognition ───────────────────────────────────────────── + +type SpeechHandle = { + start: () => void; + stop: () => void; + abort: () => void; + onresult: ((ev: { resultIndex: number; results: unknown }) => void) | null; + onerror: ((ev: { error: string }) => void) | null; + onend: (() => void) | null; + continuous: boolean; + interimResults: boolean; + maxAlternatives: number; + lang: string; + startCalls: number; + stopCalls: number; + abortCalls: number; +}; + +let recognitionInstance: SpeechHandle | null = 
null; + +// Factory pattern instead of `class` so we can avoid aliasing `this` in the +// constructor (the lint rule disallows it) while still satisfying the +// `new ()` shape that useVoiceInput's `new Ctor()` calls. +function FakeSpeechRecognition() { + const handle: SpeechHandle = { + continuous: false, + interimResults: false, + maxAlternatives: 1, + lang: "", + onresult: null, + onerror: null, + onend: null, + startCalls: 0, + stopCalls: 0, + abortCalls: 0, + start() { + handle.startCalls += 1; + }, + stop() { + handle.stopCalls += 1; + }, + abort() { + handle.abortCalls += 1; + }, + }; + recognitionInstance = handle; + return handle; +} + +// Import after mocks so the module under test sees the mocked store. +import { useVoiceInput } from "./use-voice-input"; + +// ── Tests ─────────────────────────────────────────────────────────────── + +beforeEach(() => { + voicePrefs.value = { + engine: "auto", + language: "auto", + mode: "toggle", + autoSend: false, + whisperWebModel: "base", + }; + recognitionInstance = null; + transcribeAudio.mockReset(); + (window as unknown as { SpeechRecognition: unknown }).SpeechRecognition = + FakeSpeechRecognition as unknown as new () => SpeechHandle; + // MediaRecorder/getUserMedia not used in the auto→webSpeech path, but provide + // a stub so capability detection sees audioCapture available too. 
+ (window as unknown as { MediaRecorder: { isTypeSupported: () => boolean } }).MediaRecorder = { + isTypeSupported: () => true, + }; + Object.defineProperty(global.navigator, "mediaDevices", { + value: { getUserMedia: vi.fn() }, + configurable: true, + }); +}); + +afterEach(() => { + delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition; + delete (window as unknown as { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition; + delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder; +}); + +describe("useVoiceInput — Web Speech engine", () => { + it("reports supported and resolves engine = webSpeech under the default auto preference", () => { + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() })); + expect(result.current.supported).toBe(true); + expect(result.current.engine).toBe("webSpeech"); + }); + + it("transitions idle → recording on start() and emits the final transcript on stop()", async () => { + const onTranscript = vi.fn(); + const { result } = renderHook(() => useVoiceInput({ onTranscript })); + + await act(async () => { + await result.current.start(); + }); + expect(result.current.state).toBe("recording"); + expect(recognitionInstance?.startCalls).toBe(1); + + act(() => { + recognitionInstance?.onresult?.({ + resultIndex: 0, + results: { + length: 1, + 0: { isFinal: true, length: 1, 0: { transcript: "hello world" } }, + } as unknown, + }); + recognitionInstance?.onend?.(); + }); + + await waitFor(() => { + expect(onTranscript).toHaveBeenCalledWith("hello world"); + expect(result.current.state).toBe("idle"); + }); + }); + + it("maps a not-allowed permission error to a permission-denied VoiceError", async () => { + const onError = vi.fn(); + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn(), onError })); + + await act(async () => { + await result.current.start(); + }); + act(() => { + recognitionInstance?.onerror?.({ error: "not-allowed" }); + }); + + 
expect(onError).toHaveBeenCalledWith({ + code: "permission-denied", + message: "Microphone permission denied.", + }); + expect(result.current.state).toBe("idle"); + }); +}); + +describe("useVoiceInput — capability gating", () => { + it("returns supported=false and engine=null when no engine is usable", () => { + delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition; + delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder; + Object.defineProperty(global.navigator, "mediaDevices", { value: {}, configurable: true }); + + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() })); + expect(result.current.supported).toBe(false); + expect(result.current.engine).toBeNull(); + }); + + it("disables the hook entirely when enabled=false", () => { + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn(), enabled: false })); + expect(result.current.supported).toBe(false); + expect(result.current.engine).toBeNull(); + }); +}); + +describe("useVoiceInput — language preference", () => { + it("passes the pinned BCP-47 language to SpeechRecognition.lang", async () => { + voicePrefs.value = { ...voicePrefs.value, language: "pt-PT" }; + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() })); + + await act(async () => { + await result.current.start(); + }); + expect(recognitionInstance?.lang).toBe("pt-PT"); + }); + + it("falls back to navigator.language when 'auto'", async () => { + voicePrefs.value = { ...voicePrefs.value, language: "auto" }; + Object.defineProperty(global.navigator, "language", { value: "fr-FR", configurable: true }); + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() })); + await act(async () => { + await result.current.start(); + }); + expect(recognitionInstance?.lang).toBe("fr-FR"); + }); +}); diff --git a/apps/web/hooks/use-voice-input.ts b/apps/web/hooks/use-voice-input.ts new file mode 100644 index 000000000..7f11b79cf --- /dev/null 
+++ b/apps/web/hooks/use-voice-input.ts
@@ -0,0 +1,551 @@
+"use client";
+
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { ApiError } from "@/lib/api/client";
+import { transcribeAudio } from "@/lib/api/domains/voice-api";
+import { detectVoiceCapabilities, resolveActiveEngine } from "@/lib/voice/capabilities";
+import { WhisperWebClient, type WhisperWebProgress } from "@/lib/voice/whisper-web-client";
+import { useAppStore } from "@/components/state-provider";
+import type { VoiceInputEngine, WhisperWebModelSize } from "@/lib/types/http-voice";
+
+// ── Public types ────────────────────────────────────────────────────────
+
+export type VoiceInputState = "idle" | "requesting" | "recording" | "processing";
+
+export type VoiceErrorCode =
+  | "permission-denied"
+  | "no-speech"
+  | "not-configured"
+  | "network"
+  | "unsupported"
+  | "model-load"
+  | "unknown";
+
+export type VoiceError = { code: VoiceErrorCode; message: string };
+
+export type VoiceModelLoadState = {
+  state: "idle" | "loading" | "ready" | "error";
+  progress: number;
+};
+
+export type UseVoiceInputOptions = {
+  onTranscript: (text: string) => void;
+  onError?: (error: VoiceError) => void;
+  /** Set false to disable the hook entirely (e.g. for read-only contexts). */
+  enabled?: boolean;
+};
+
+export type UseVoiceInputResult = {
+  supported: boolean;
+  engine: Exclude<VoiceInputEngine, "auto"> | null;
+  state: VoiceInputState;
+  error: VoiceError | null;
+  modelLoad: VoiceModelLoadState;
+  start: () => Promise<void>;
+  stop: () => Promise<void>;
+  cancel: () => void;
+};
+
+// ── Web Speech typings (DOM lib doesn't ship them) ─────────────────────
+
+type SpeechAlt = { transcript: string };
+type SpeechResult = { isFinal: boolean; 0: SpeechAlt; length: number };
+type SpeechResultList = { length: number; [index: number]: SpeechResult };
+type SpeechResultEvent = { resultIndex: number; results: SpeechResultList };
+type SpeechErrorEvent = { error: string; message?: string };
+type SpeechRecognitionInstance = {
+  lang: string;
+  continuous: boolean;
+  interimResults: boolean;
+  maxAlternatives: number;
+  start: () => void;
+  stop: () => void;
+  abort: () => void;
+  onresult: ((ev: SpeechResultEvent) => void) | null;
+  onerror: ((ev: SpeechErrorEvent) => void) | null;
+  onend: (() => void) | null;
+};
+
+type SpeechCtor = new () => SpeechRecognitionInstance;
+
+/** Instantiates the (possibly webkit-prefixed) SpeechRecognition, or null when absent. */
+function createSpeechRecognition(): SpeechRecognitionInstance | null {
+  if (typeof window === "undefined") return null;
+  const w = window as Window & {
+    SpeechRecognition?: SpeechCtor;
+    webkitSpeechRecognition?: SpeechCtor;
+  };
+  const Ctor = w.SpeechRecognition ?? w.webkitSpeechRecognition;
+  return Ctor ? new Ctor() : null;
+}
+
+// ── Error mappers ───────────────────────────────────────────────────────
+
+function mapSpeechError(code: string): VoiceError {
+  if (code === "not-allowed" || code === "service-not-allowed") {
+    return { code: "permission-denied", message: "Microphone permission denied." };
+  }
+  if (code === "no-speech") return { code: "no-speech", message: "No speech detected. Try again." };
+  if (code === "network") {
+    return { code: "network", message: "Voice recognition lost network connection." };
+  }
+  if (code === "audio-capture") return { code: "unknown", message: "No microphone was found." };
+  return { code: "unknown", message: `Voice recognition error: ${code}` };
+}
+
+function mapMicError(err: unknown): VoiceError {
+  if (err && typeof err === "object" && "name" in err) {
+    const name = (err as { name: string }).name;
+    if (name === "NotAllowedError" || name === "SecurityError") {
+      return { code: "permission-denied", message: "Microphone permission denied." };
+    }
+    if (name === "NotFoundError" || name === "OverconstrainedError") {
+      return { code: "unknown", message: "No microphone was found." };
+    }
+  }
+  return { code: "unknown", message: "Failed to start recording." };
+}
+
+function mapTranscribeError(err: unknown): VoiceError {
+  if (err instanceof ApiError && err.status === 503) {
+    return {
+      code: "not-configured",
+      message:
+        "Server-side transcription isn't configured. Pick Web Speech or Whisper Web in Voice Mode settings.",
+    };
+  }
+  return { code: "network", message: "Transcription failed. Please try again." };
+}
+
+function whisperErrorMessage(err: unknown): VoiceError {
+  const message = err instanceof Error ? err.message : "Whisper Web failed to transcribe.";
+  return { code: "model-load", message };
+}
+
+function resolveLang(preference: string): string {
+  if (preference && preference !== "auto") return preference;
+  return typeof navigator !== "undefined" ? navigator.language : "en-US";
+}
+
+function resolveWhisperLang(preference: string): string | undefined {
+  if (preference && preference !== "auto") return preference;
+  return undefined;
+}
+
+// ── MediaRecorder capture primitive ─────────────────────────────────────
+
+function pickRecorderMime(): { mime: string; ext: string } {
+  if (typeof window === "undefined" || typeof window.MediaRecorder === "undefined") {
+    return { mime: "", ext: "webm" };
+  }
+  const candidates: Array<{ mime: string; ext: string }> = [
+    { mime: "audio/webm;codecs=opus", ext: "webm" },
+    { mime: "audio/webm", ext: "webm" },
+    { mime: "audio/mp4", ext: "m4a" },
+    { mime: "audio/ogg;codecs=opus", ext: "ogg" },
+    { mime: "audio/wav", ext: "wav" },
+  ];
+  for (const c of candidates) {
+    if (window.MediaRecorder.isTypeSupported(c.mime)) return c;
+  }
+  return { mime: "", ext: "webm" };
+}
+
+type CaptureHandle = {
+  stream: MediaStream;
+  recorder: MediaRecorder;
+  chunks: Blob[];
+  mime: string;
+  ext: string;
+};
+
+/**
+ * Opens the microphone and starts a MediaRecorder on it. If the recorder
+ * cannot be constructed or started, the stream is released before rethrowing
+ * so the browser's mic indicator does not stay on.
+ */
+async function startCapture(): Promise<CaptureHandle> {
+  const { mime, ext } = pickRecorderMime();
+  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+  try {
+    const recorder = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
+    const chunks: Blob[] = [];
+    recorder.addEventListener("dataavailable", (e) => {
+      if (e.data && e.data.size > 0) chunks.push(e.data);
+    });
+    recorder.start();
+    return { stream, recorder, chunks, mime, ext };
+  } catch (err) {
+    for (const t of stream.getTracks()) t.stop();
+    throw err;
+  }
+}
+
+function teardownCapture(handle: CaptureHandle | null) {
+  if (!handle) return;
+  for (const t of handle.stream.getTracks()) t.stop();
+}
+
+function stopCapture(handle: CaptureHandle): Promise<Blob | null> {
+  return new Promise<Blob | null>((resolve) => {
+    if (handle.recorder.state === "inactive") {
+      teardownCapture(handle);
+      resolve(null);
+      return;
+    }
+    handle.recorder.addEventListener(
+      "stop",
+      () => {
+        const type = handle.recorder.mimeType || handle.mime || "audio/webm";
+        const blob = handle.chunks.length > 0 ? new Blob(handle.chunks, { type }) : null;
+        teardownCapture(handle);
+        resolve(blob);
+      },
+      { once: true },
+    );
+    handle.recorder.stop();
+  });
+}
+
+// ── Driver refs ─────────────────────────────────────────────────────────
+
+type ActiveDriverRef =
+  | { kind: "webSpeech"; recognition: SpeechRecognitionInstance }
+  | { kind: "capture"; handle: CaptureHandle; engine: "whisperWeb" | "whisperServer" }
+  | null;
+
+type DriverRefBox = { current: ActiveDriverRef };
+type WhisperRefBox = { current: WhisperWebClient | null };
+type SessionRefBox = { current: number };
+
+/**
+ * Discards the active driver without delivering anything to the caller.
+ * The Web Speech handlers are detached before abort() because abort() can
+ * still fire `error`/`end`, which would otherwise surface an error toast and
+ * emit the partial transcript for a session the user just cancelled.
+ */
+function abortDriver(ref: DriverRefBox) {
+  const driver = ref.current;
+  if (!driver) return;
+  if (driver.kind === "webSpeech") {
+    driver.recognition.onresult = null;
+    driver.recognition.onerror = null;
+    driver.recognition.onend = null;
+    driver.recognition.abort();
+  } else {
+    teardownCapture(driver.handle);
+  }
+  ref.current = null;
+}
+
+// ── Web Speech driver ───────────────────────────────────────────────────
+
+type WebSpeechHandlers = {
+  setState: (s: VoiceInputState) => void;
+  driverRef: DriverRefBox;
+  emitError: (e: VoiceError) => void;
+  onTranscriptRef: { current: (text: string) => void };
+  lang: string;
+};
+
+function runWebSpeech(h: WebSpeechHandlers): void {
+  const recognition = createSpeechRecognition();
+  if (!recognition) {
+    h.emitError({ code: "unsupported", message: "Voice recognition is not supported." });
+    return;
+  }
+  const transcripts: string[] = [];
+  recognition.continuous = true;
+  recognition.interimResults = false;
+  recognition.maxAlternatives = 1;
+  recognition.lang = h.lang;
+  recognition.onresult = (ev) => {
+    for (let i = ev.resultIndex; i < ev.results.length; i++) {
+      const r = ev.results[i];
+      if (r.isFinal && r[0]?.transcript) transcripts.push(r[0].transcript.trim());
+    }
+  };
+  recognition.onerror = (ev) => h.emitError(mapSpeechError(ev.error));
+  recognition.onend = () => {
+    h.driverRef.current = null;
+    h.setState("idle");
+    const joined = transcripts.join(" ").trim();
+    if (joined) h.onTranscriptRef.current(joined);
+  };
+  try {
+    recognition.start();
+    h.driverRef.current = { kind: "webSpeech", recognition };
+    h.setState("recording");
+  } catch {
+    h.emitError({ code: "unknown", message: "Failed to start voice recognition." });
+  }
+}
+
+// ── Capture engines (whisperWeb + whisperServer) ───────────────────────
+
+type CaptureHandlers = {
+  setState: (s: VoiceInputState) => void;
+  emitError: (e: VoiceError) => void;
+  driverRef: DriverRefBox;
+  /** Returns false once the session that issued this call was cancelled or unmounted. */
+  isCurrent: () => boolean;
+};
+
+/**
+ * Requests the microphone and starts recording. getUserMedia can sit on a
+ * permission prompt for a long time, so the session is re-checked after the
+ * await: if it was cancelled meanwhile the fresh stream is released instead
+ * of being installed as the active driver.
+ */
+async function beginCapture(
+  which: "whisperWeb" | "whisperServer",
+  h: CaptureHandlers,
+): Promise<void> {
+  h.setState("requesting");
+  try {
+    const handle = await startCapture();
+    if (!h.isCurrent()) {
+      teardownCapture(handle);
+      return;
+    }
+    h.driverRef.current = { kind: "capture", handle, engine: which };
+    h.setState("recording");
+  } catch (err) {
+    if (h.isCurrent()) h.emitError(mapMicError(err));
+  }
+}
+
+type FinishCaptureHandlers = {
+  driverRef: DriverRefBox;
+  whisperRef: WhisperRefBox;
+  setState: (s: VoiceInputState) => void;
+  setModelLoad: (next: VoiceModelLoadState) => void;
+  emitError: (e: VoiceError) => void;
+  onTranscriptRef: { current: (text: string) => void };
+  whisperModel: WhisperWebModelSize;
+  language: string;
+  /** Returns false once the session being finished was cancelled or unmounted. */
+  isCurrent: () => boolean;
+};
+
+/**
+ * Stops the recorder and transcribes the captured audio with the engine the
+ * session was started with. Results and errors of a cancelled session are
+ * dropped silently.
+ */
+async function finishCapture(h: FinishCaptureHandlers): Promise<void> {
+  const driver = h.driverRef.current;
+  if (!driver || driver.kind !== "capture") return;
+  // Claim the driver up front so a second stop() cannot finish it twice.
+  h.driverRef.current = null;
+  h.setState("processing");
+  const blob = await stopCapture(driver.handle);
+  if (!h.isCurrent()) return;
+  if (!blob) {
+    h.setState("idle");
+    return;
+  }
+  try {
+    const text =
+      driver.engine === "whisperServer"
+        ? await transcribeViaServer(blob, driver.handle.ext)
+        : await transcribeViaWhisperWeb(blob, h);
+    if (!h.isCurrent()) return;
+    if (text) h.onTranscriptRef.current(text);
+    h.setState("idle");
+  } catch (err) {
+    if (!h.isCurrent()) return;
+    if (driver.engine === "whisperServer") h.emitError(mapTranscribeError(err));
+    else h.emitError(whisperErrorMessage(err));
+  }
+}
+
+async function transcribeViaServer(blob: Blob, ext: string): Promise<string> {
+  const result = await transcribeAudio(blob, `recording.${ext}`);
+  return result.text.trim();
+}
+
+async function transcribeViaWhisperWeb(blob: Blob, h: FinishCaptureHandlers): Promise<string> {
+  const client = await ensureWhisperClient(h);
+  const text = await client.transcribe(blob, resolveWhisperLang(h.language));
+  return text.trim();
+}
+
+/**
+ * Returns the shared Whisper client, creating it on first use. The client is
+ * held in a local so a dispose that nulls the ref while init() is pending
+ * cannot make this function resolve to null.
+ */
+async function ensureWhisperClient(h: FinishCaptureHandlers): Promise<WhisperWebClient> {
+  let client = h.whisperRef.current;
+  if (!client) {
+    client = new WhisperWebClient({
+      onProgress: (p: WhisperWebProgress) =>
+        h.setModelLoad({ state: "loading", progress: p.progress }),
+    });
+    h.whisperRef.current = client;
+    h.setModelLoad({ state: "loading", progress: 0 });
+  }
+  try {
+    await client.init(h.whisperModel);
+    h.setModelLoad({ state: "ready", progress: 1 });
+  } catch (err) {
+    h.setModelLoad({ state: "error", progress: 0 });
+    throw err;
+  }
+  return client;
+}
+
+// ── Hook helpers ────────────────────────────────────────────────────────
+
+function useVoiceModePrefs() {
+  return useAppStore((s) => s.userSettings.voiceMode);
+}
+
+function useCallbackRefs(opts: UseVoiceInputOptions) {
+  const onTranscriptRef = useRef(opts.onTranscript);
+  const onErrorRef = useRef(opts.onError);
+  useEffect(() => {
+    onTranscriptRef.current = opts.onTranscript;
+    onErrorRef.current = opts.onError;
+  });
+  return { onTranscriptRef, onErrorRef };
+}
+
+// Re-init the whisper client whenever the user switches model size, so we
+// don't keep an old in-memory model around when the next start() runs.
+function useDisposeWhisperOnModelChange(
+  whisperRef: WhisperRefBox,
+  modelSize: string,
+  reset: () => void,
+) {
+  const previousModelRef = useRef(modelSize);
+  useEffect(() => {
+    if (previousModelRef.current === modelSize) return;
+    previousModelRef.current = modelSize;
+    whisperRef.current?.dispose();
+    whisperRef.current = null;
+    reset();
+  }, [modelSize, whisperRef, reset]);
+}
+
+function useUnmountCleanup(
+  driverRef: DriverRefBox,
+  whisperRef: WhisperRefBox,
+  sessionRef: SessionRefBox,
+) {
+  useEffect(() => {
+    return () => {
+      // Invalidate any in-flight start/stop so it cannot touch state after unmount.
+      sessionRef.current += 1;
+      abortDriver(driverRef);
+      whisperRef.current?.dispose();
+      whisperRef.current = null;
+    };
+  }, [driverRef, whisperRef, sessionRef]);
+}
+
+// ── Hook ────────────────────────────────────────────────────────────────
+
+/**
+ * Voice input state machine for the chat composer. Resolves the active engine
+ * from the user's Voice Mode preferences and the detected browser
+ * capabilities, and exposes start/stop/cancel controls.
+ *
+ * stop() delivers the transcript through `onTranscript`; cancel() discards the
+ * session, including one still waiting on the microphone permission prompt.
+ */
+export function useVoiceInput(opts: UseVoiceInputOptions): UseVoiceInputResult {
+  const caps = useMemo(() => detectVoiceCapabilities(), []);
+  const prefs = useVoiceModePrefs();
+  const enabled = opts.enabled !== false;
+  const engine = useMemo(
+    () => (enabled ? resolveActiveEngine(prefs.engine, caps, true) : null),
+    [enabled, prefs.engine, caps],
+  );
+  const supported = engine !== null;
+
+  const [state, setState] = useState<VoiceInputState>("idle");
+  const [error, setError] = useState<VoiceError | null>(null);
+  const [modelLoad, setModelLoad] = useState<VoiceModelLoadState>({
+    state: "idle",
+    progress: 0,
+  });
+
+  const driverRef = useRef<ActiveDriverRef>(null);
+  const whisperRef = useRef<WhisperWebClient | null>(null);
+  // Bumped on every start/cancel/unmount so in-flight async work can tell it is stale.
+  const sessionRef = useRef(0);
+  const { onTranscriptRef, onErrorRef } = useCallbackRefs(opts);
+
+  const emitError = useCallback(
+    (e: VoiceError) => {
+      setError(e);
+      setState("idle");
+      onErrorRef.current?.(e);
+    },
+    [onErrorRef],
+  );
+
+  const resetModelLoad = useCallback(() => setModelLoad({ state: "idle", progress: 0 }), []);
+
+  useUnmountCleanup(driverRef, whisperRef, sessionRef);
+  useDisposeWhisperOnModelChange(whisperRef, prefs.whisperWebModel, resetModelLoad);
+
+  const start = useCallback(async () => {
+    if (!supported || !engine) {
+      emitError({ code: "unsupported", message: "Voice input is not supported in this browser." });
+      return;
+    }
+    if (state !== "idle") return;
+    setError(null);
+    const session = sessionRef.current + 1;
+    sessionRef.current = session;
+    if (engine === "webSpeech") {
+      runWebSpeech({
+        setState,
+        driverRef,
+        emitError,
+        onTranscriptRef,
+        lang: resolveLang(prefs.language),
+      });
+      return;
+    }
+    await beginCapture(engine, {
+      setState,
+      emitError,
+      driverRef,
+      isCurrent: () => sessionRef.current === session,
+    });
+  }, [supported, engine, state, emitError, prefs.language, onTranscriptRef]);
+
+  const stop = useCallback(async () => {
+    const driver = driverRef.current;
+    if (!driver) return;
+    if (driver.kind === "webSpeech") {
+      driver.recognition.stop();
+      return;
+    }
+    const session = sessionRef.current;
+    await finishCapture({
+      driverRef,
+      whisperRef,
+      setState,
+      setModelLoad,
+      emitError,
+      onTranscriptRef,
+      whisperModel: prefs.whisperWebModel,
+      language: prefs.language,
+      isCurrent: () => sessionRef.current === session,
+    });
+  }, [emitError, prefs.whisperWebModel, prefs.language, onTranscriptRef]);
+
+  const cancel = useCallback(() => {
+    sessionRef.current += 1;
+    abortDriver(driverRef);
+    setState("idle");
+    setError(null);
+  }, []);
+
+  return { supported, engine, state, error, modelLoad, start, stop, cancel };
+}
+diff
--git a/apps/web/lib/api/domains/settings-api.ts b/apps/web/lib/api/domains/settings-api.ts index 343e30efa..ec9b229be 100644 --- a/apps/web/lib/api/domains/settings-api.ts +++ b/apps/web/lib/api/domains/settings-api.ts @@ -21,6 +21,7 @@ import type { UserSettingsResponse, DynamicModelsResponse, } from "@/lib/types/http"; +import type { VoiceModeSettings } from "@/lib/types/http-voice"; // User settings export async function fetchUserSettings(options?: ApiRequestOptions) { @@ -52,6 +53,7 @@ export async function updateUserSettings( terminal_font_family?: string; terminal_font_size?: number; changes_panel_layout?: "flat" | "tree"; + voice_mode?: VoiceModeSettings; }, options?: ApiRequestOptions, ) { diff --git a/apps/web/lib/api/domains/voice-api.test.ts b/apps/web/lib/api/domains/voice-api.test.ts new file mode 100644 index 000000000..d3618cae8 --- /dev/null +++ b/apps/web/lib/api/domains/voice-api.test.ts @@ -0,0 +1,63 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { ApiError } from "../client"; +import { transcribeAudio } from "./voice-api"; + +const originalFetch = global.fetch; + +describe("transcribeAudio", () => { + afterEach(() => { + global.fetch = originalFetch; + }); + + beforeEach(() => { + vi.restoreAllMocks(); + }); + + it("posts multipart/form-data with the audio under the 'audio' field", async () => { + let capturedRequest: { method?: string; bodyText: string } = { bodyText: "" }; + global.fetch = vi.fn(async (_url: RequestInfo | URL, init?: RequestInit) => { + capturedRequest = { + method: init?.method, + bodyText: init?.body instanceof FormData ? 
"" : String(init?.body), + }; + return new Response(JSON.stringify({ text: "hi" }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + }) as unknown as typeof fetch; + + const blob = new Blob([new Uint8Array([1, 2, 3])], { type: "audio/webm" }); + const result = await transcribeAudio(blob, "clip.webm", { + baseUrl: "http://example.test", + }); + + expect(result.text).toBe("hi"); + expect(capturedRequest.method).toBe("POST"); + expect(capturedRequest.bodyText).toBe(""); + }); + + it("throws ApiError(503) when the server reports not-configured", async () => { + global.fetch = vi.fn( + async () => + new Response(JSON.stringify({ error: "voice transcription is not configured" }), { + status: 503, + }), + ) as unknown as typeof fetch; + + const blob = new Blob([new Uint8Array([1])], { type: "audio/webm" }); + await expect(transcribeAudio(blob, "x.webm", { baseUrl: "http://x" })).rejects.toMatchObject({ + status: 503, + }); + }); + + it("surfaces non-2xx errors as ApiError instances", async () => { + global.fetch = vi.fn( + async () => new Response("bad", { status: 502, statusText: "Bad Gateway" }), + ) as unknown as typeof fetch; + + const blob = new Blob([new Uint8Array([1])], { type: "audio/webm" }); + await expect(transcribeAudio(blob, "x.webm", { baseUrl: "http://x" })).rejects.toBeInstanceOf( + ApiError, + ); + }); +}); diff --git a/apps/web/lib/api/domains/voice-api.ts b/apps/web/lib/api/domains/voice-api.ts new file mode 100644 index 000000000..88e65def5 --- /dev/null +++ b/apps/web/lib/api/domains/voice-api.ts @@ -0,0 +1,49 @@ +import { ApiError, type ApiRequestOptions } from "../client"; +import { getBackendConfig } from "@/lib/config"; + +export type TranscribeResponse = { + text: string; +}; + +/** + * POST audio to the backend Whisper fallback. Returns the transcribed text. + * + * Throws ApiError on non-2xx. 
Two statuses are meaningful to the caller: + * - 503: server has no API key configured — the hook should treat the + * Whisper fallback as unavailable and surface a clean message. + * - any other non-2xx: transient error — show a generic toast. + */ +export async function transcribeAudio( + blob: Blob, + filename: string, + options?: ApiRequestOptions, +): Promise { + const baseUrl = options?.baseUrl ?? getBackendConfig().apiBaseUrl; + const formData = new FormData(); + formData.append("audio", blob, filename); + + // Do NOT set Content-Type: the browser sets multipart/form-data with the + // correct boundary automatically when given a FormData body. + const response = await fetch(`${baseUrl}/api/v1/transcribe`, { + method: "POST", + body: formData, + ...options?.init, + }); + + if (!response.ok) { + let body: unknown = null; + try { + body = await response.json(); + } catch { + // body remains null + } + let message = `Transcription failed: ${response.status} ${response.statusText}`; + if (body && typeof body === "object" && "error" in body) { + const errVal = (body as { error?: unknown }).error; + if (typeof errVal === "string") message = errVal; + } + throw new ApiError(message, response.status, body); + } + + return (await response.json()) as TranscribeResponse; +} diff --git a/apps/web/lib/keyboard/constants.ts b/apps/web/lib/keyboard/constants.ts index 31271c0b9..e05ab2373 100644 --- a/apps/web/lib/keyboard/constants.ts +++ b/apps/web/lib/keyboard/constants.ts @@ -153,4 +153,10 @@ export const SHORTCUTS = { key: KEYS.F, modifiers: { ctrlOrCmd: true }, }, + // Cmd+Shift+M starts/stops voice input on the chat composer. The default + // is configurable per-user via the Voice Mode settings page. 
+ VOICE_INPUT_TOGGLE: { + key: KEYS.M, + modifiers: { ctrlOrCmd: true, shift: true }, + }, } as const; diff --git a/apps/web/lib/keyboard/shortcut-overrides.test.ts b/apps/web/lib/keyboard/shortcut-overrides.test.ts index 6453bc902..43c59c3df 100644 --- a/apps/web/lib/keyboard/shortcut-overrides.test.ts +++ b/apps/web/lib/keyboard/shortcut-overrides.test.ts @@ -20,7 +20,8 @@ describe("CONFIGURABLE_SHORTCUTS", () => { expect(ids).toContain("FOCUS_INPUT"); expect(ids).toContain("TOGGLE_PLAN_MODE"); expect(ids).toContain("TASK_SWITCHER"); - expect(ids).toHaveLength(10); + expect(ids).toContain("VOICE_INPUT_TOGGLE"); + expect(ids).toHaveLength(11); }); it("each entry has a label and default matching SHORTCUTS", () => { diff --git a/apps/web/lib/keyboard/shortcut-overrides.ts b/apps/web/lib/keyboard/shortcut-overrides.ts index 8ac1b7a37..a31d61e15 100644 --- a/apps/web/lib/keyboard/shortcut-overrides.ts +++ b/apps/web/lib/keyboard/shortcut-overrides.ts @@ -10,7 +10,8 @@ export type ConfigurableShortcutId = | "NEW_TASK" | "FOCUS_INPUT" | "TOGGLE_PLAN_MODE" - | "TASK_SWITCHER"; + | "TASK_SWITCHER" + | "VOICE_INPUT_TOGGLE"; export type StoredShortcutOverrides = Record< string, @@ -31,6 +32,7 @@ export const CONFIGURABLE_SHORTCUTS: Record< FOCUS_INPUT: { label: "Focus Chat Input", default: SHORTCUTS.FOCUS_INPUT }, TOGGLE_PLAN_MODE: { label: "Toggle Plan Mode", default: SHORTCUTS.TOGGLE_PLAN_MODE }, TASK_SWITCHER: { label: "Recent Task Switcher", default: SHORTCUTS.TASK_SWITCHER }, + VOICE_INPUT_TOGGLE: { label: "Voice Input", default: SHORTCUTS.VOICE_INPUT_TOGGLE }, }; export function getShortcut( diff --git a/apps/web/lib/ssr/user-settings.test.ts b/apps/web/lib/ssr/user-settings.test.ts index 04f425b0d..38b681b6e 100644 --- a/apps/web/lib/ssr/user-settings.test.ts +++ b/apps/web/lib/ssr/user-settings.test.ts @@ -1,5 +1,10 @@ import { describe, it, expect } from "vitest"; -import { buildCoreFields, mapUserSettingsResponse, parseChangesPanelLayout } from "./user-settings"; 
+import { + buildCoreFields, + mapUserSettingsResponse, + parseChangesPanelLayout, + parseVoiceMode, +} from "./user-settings"; describe("buildCoreFields", () => { it("maps terminal_font_family to terminalFontFamily", () => { @@ -103,3 +108,78 @@ describe("parseChangesPanelLayout", () => { expect(parseChangesPanelLayout("")).toBe("flat"); }); }); + +describe("parseVoiceMode", () => { + it("maps every field from the snake_case wire payload", () => { + expect( + parseVoiceMode({ + enabled: false, + engine: "whisperWeb", + language: "pt-PT", + mode: "hold", + auto_send: true, + whisper_web_model: "small", + }), + ).toEqual({ + enabled: false, + engine: "whisperWeb", + language: "pt-PT", + mode: "hold", + autoSend: true, + whisperWebModel: "small", + }); + }); + + it("returns the defaults when the payload is undefined", () => { + expect(parseVoiceMode(undefined)).toEqual({ + enabled: true, + engine: "auto", + language: "auto", + mode: "toggle", + autoSend: false, + whisperWebModel: "base", + }); + }); + + it("defaults enabled to true when the wire payload omits the field (old rows)", () => { + const result = parseVoiceMode({ + engine: "auto", + language: "auto", + mode: "toggle", + auto_send: false, + whisper_web_model: "base", + } as unknown as Parameters[0]); + expect(result.enabled).toBe(true); + }); + + it("fills in defaults for missing string fields and coerces auto_send to false", () => { + const result = parseVoiceMode({ + engine: "" as unknown as "auto", + language: "", + mode: "" as unknown as "toggle", + whisper_web_model: "" as unknown as "base", + } as unknown as Parameters[0]); + expect(result).toEqual({ + enabled: true, + engine: "auto", + language: "auto", + mode: "toggle", + autoSend: false, + whisperWebModel: "base", + }); + }); +}); + +describe("mapUserSettingsResponse voice mode", () => { + it("defaults the whole voiceMode object when response is null", () => { + const result = mapUserSettingsResponse(null); + expect(result.voiceMode).toEqual({ + 
enabled: true, + engine: "auto", + language: "auto", + mode: "toggle", + autoSend: false, + whisperWebModel: "base", + }); + }); +}); diff --git a/apps/web/lib/ssr/user-settings.ts b/apps/web/lib/ssr/user-settings.ts index 74a3d127f..b2ed73508 100644 --- a/apps/web/lib/ssr/user-settings.ts +++ b/apps/web/lib/ssr/user-settings.ts @@ -1,6 +1,8 @@ import { fromApiSidebarView } from "@/lib/state/slices/ui/sidebar-view-wire"; import type { SidebarView } from "@/lib/state/slices/ui/sidebar-view-types"; +import { DEFAULT_VOICE_MODE_STATE, type VoiceModeState } from "@/lib/state/slices/settings/types"; import type { SavedLayout, UserSettingsResponse } from "@/lib/types/http"; +import type { VoiceModeSettings } from "@/lib/types/http-voice"; export type UserSettingsData = NonNullable; @@ -12,6 +14,25 @@ export function parseChangesPanelLayout(value: string | undefined): "flat" | "tr return value === "tree" ? "tree" : "flat"; } +/** + * Maps the backend's snake_case VoiceMode payload into the camelCase shape + * the store and UI use. Missing or partial payloads fall back to the defaults + * so an old user row (written before VoiceMode existed) doesn't surface as + * an empty string the radio groups can't render. `enabled` defaults to true + * for users who haven't toggled it — voice mode is opt-out, not opt-in. + */ +export function parseVoiceMode(value: VoiceModeSettings | undefined): VoiceModeState { + if (!value) return { ...DEFAULT_VOICE_MODE_STATE }; + return { + enabled: typeof value.enabled === "boolean" ? value.enabled : true, + engine: value.engine || DEFAULT_VOICE_MODE_STATE.engine, + language: value.language || DEFAULT_VOICE_MODE_STATE.language, + mode: value.mode || DEFAULT_VOICE_MODE_STATE.mode, + autoSend: typeof value.auto_send === "boolean" ? 
value.auto_send : false, + whisperWebModel: value.whisper_web_model || DEFAULT_VOICE_MODE_STATE.whisperWebModel, + }; +} + function buildTerminalFields(s: UserSettingsData) { return { terminalLinkBehavior: parseTerminalLinkBehavior(s.terminal_link_behavior), @@ -21,6 +42,10 @@ function buildTerminalFields(s: UserSettingsData) { }; } +function buildVoiceModeFields(s: UserSettingsData) { + return { voiceMode: parseVoiceMode(s.voice_mode) }; +} + function buildIdentityFields(s: UserSettingsData) { return { workspaceId: s.workspace_id || null, @@ -51,6 +76,7 @@ export function buildCoreFields(s: UserSettingsData) { savedLayouts: s.saved_layouts ?? [], sidebarViews: (s.sidebar_views ?? []).map(fromApiSidebarView) as SidebarView[], ...buildTerminalFields(s), + ...buildVoiceModeFields(s), }; } @@ -91,6 +117,7 @@ export function mapUserSettingsResponse(response: UserSettingsResponse | null) { terminalFontFamily: null, terminalFontSize: null, changesPanelLayout: "flat" as const, + voiceMode: { ...DEFAULT_VOICE_MODE_STATE }, ...buildLspFields(undefined), loaded: false, }; diff --git a/apps/web/lib/state/slices/settings/settings-slice.ts b/apps/web/lib/state/slices/settings/settings-slice.ts index 26ce9c67b..d9dca4acb 100644 --- a/apps/web/lib/state/slices/settings/settings-slice.ts +++ b/apps/web/lib/state/slices/settings/settings-slice.ts @@ -1,5 +1,5 @@ import type { StateCreator } from "zustand"; -import type { SettingsSlice, SettingsSliceState } from "./types"; +import { DEFAULT_VOICE_MODE_STATE, type SettingsSlice, type SettingsSliceState } from "./types"; export const defaultSettingsState: SettingsSliceState = { executors: { items: [] }, @@ -44,6 +44,7 @@ export const defaultSettingsState: SettingsSliceState = { terminalFontFamily: null, terminalFontSize: null, changesPanelLayout: "flat", + voiceMode: { ...DEFAULT_VOICE_MODE_STATE }, loaded: false, }, }; diff --git a/apps/web/lib/state/slices/settings/types.ts b/apps/web/lib/state/slices/settings/types.ts index 
73f094761..ca7740a93 100644 --- a/apps/web/lib/state/slices/settings/types.ts +++ b/apps/web/lib/state/slices/settings/types.ts @@ -11,6 +11,11 @@ import type { SavedLayout, ToolStatus, } from "@/lib/types/http"; +import type { + VoiceInputActivationMode, + VoiceInputEngine, + WhisperWebModelSize, +} from "@/lib/types/http-voice"; import type { SidebarView } from "@/lib/state/slices/ui/sidebar-view-types"; import type { SecretListItem } from "@/lib/types/http-secrets"; import type { SpritesStatus, SpritesInstance } from "@/lib/types/http-sprites"; @@ -156,9 +161,29 @@ export type UserSettingsState = { terminalFontFamily: string | null; terminalFontSize: number | null; changesPanelLayout: "flat" | "tree"; + voiceMode: VoiceModeState; loaded: boolean; }; +export type VoiceModeState = { + enabled: boolean; + engine: VoiceInputEngine; + language: string; + mode: VoiceInputActivationMode; + autoSend: boolean; + whisperWebModel: WhisperWebModelSize; +}; + +/** Default values used by the slice init and by SSR hydration fallback. 
*/ +export const DEFAULT_VOICE_MODE_STATE: VoiceModeState = { + enabled: true, + engine: "auto", + language: "auto", + mode: "toggle", + autoSend: false, + whisperWebModel: "base", +}; + export type SettingsSliceState = { executors: ExecutorsState; settingsAgents: SettingsAgentsState; diff --git a/apps/web/lib/types/backend.ts b/apps/web/lib/types/backend.ts index c97912e3c..448dba0f9 100644 --- a/apps/web/lib/types/backend.ts +++ b/apps/web/lib/types/backend.ts @@ -383,6 +383,7 @@ export type UserSettingsUpdatedPayload = { keyboard_shortcuts?: Record }>; terminal_link_behavior?: string; changes_panel_layout?: "flat" | "tree"; + voice_mode?: import("@/lib/types/http-voice").VoiceModeSettings; updated_at?: string; }; diff --git a/apps/web/lib/types/http-voice.ts b/apps/web/lib/types/http-voice.ts new file mode 100644 index 000000000..c43351524 --- /dev/null +++ b/apps/web/lib/types/http-voice.ts @@ -0,0 +1,17 @@ +/** + * Wire types for the Voice Mode user settings. Kept in their own module so + * http.ts stays under the 600-line file limit. 
+ */ + +export type VoiceInputEngine = "auto" | "webSpeech" | "whisperWeb" | "whisperServer"; +export type VoiceInputActivationMode = "toggle" | "hold"; +export type WhisperWebModelSize = "tiny" | "base" | "small"; + +export type VoiceModeSettings = { + enabled: boolean; + engine: VoiceInputEngine; + language: string; + mode: VoiceInputActivationMode; + auto_send: boolean; + whisper_web_model: WhisperWebModelSize; +}; diff --git a/apps/web/lib/types/http.ts b/apps/web/lib/types/http.ts index fae94bf0c..0953be4c3 100644 --- a/apps/web/lib/types/http.ts +++ b/apps/web/lib/types/http.ts @@ -406,6 +406,8 @@ export type SidebarViewApi = { collapsed_groups: string[]; }; +import type { VoiceModeSettings } from "./http-voice"; + export type UserSettings = { user_id: string; workspace_id: WorkspaceId; @@ -432,6 +434,7 @@ export type UserSettings = { terminal_font_family?: string; terminal_font_size?: number; changes_panel_layout?: "flat" | "tree"; + voice_mode?: VoiceModeSettings; updated_at: string; }; diff --git a/apps/web/lib/voice/capabilities.test.ts b/apps/web/lib/voice/capabilities.test.ts new file mode 100644 index 000000000..d8b8d7191 --- /dev/null +++ b/apps/web/lib/voice/capabilities.test.ts @@ -0,0 +1,97 @@ +import { describe, it, expect, afterEach, vi } from "vitest"; +import { detectVoiceCapabilities, resolveActiveEngine } from "./capabilities"; + +describe("detectVoiceCapabilities", () => { + afterEach(() => { + vi.unstubAllGlobals(); + delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition; + delete (window as unknown as { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition; + delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder; + }); + + it("reports webSpeech true when window.SpeechRecognition exists", () => { + (window as unknown as { SpeechRecognition: () => void }).SpeechRecognition = () => {}; + expect(detectVoiceCapabilities().webSpeech).toBe(true); + }); + + it("reports webSpeech true on the 
prefixed webkit variant too", () => { + (window as unknown as { webkitSpeechRecognition: () => void }).webkitSpeechRecognition = + () => {}; + expect(detectVoiceCapabilities().webSpeech).toBe(true); + }); + + it("reports audioCapture true when MediaRecorder + getUserMedia are present", () => { + (window as unknown as { MediaRecorder: object }).MediaRecorder = { + isTypeSupported: () => true, + }; + vi.stubGlobal("navigator", { mediaDevices: { getUserMedia: () => Promise.resolve({}) } }); + expect(detectVoiceCapabilities().audioCapture).toBe(true); + }); + + it("reports everything false when no APIs are available", () => { + vi.stubGlobal("navigator", {}); + expect(detectVoiceCapabilities()).toEqual({ + webSpeech: false, + whisperWeb: false, + audioCapture: false, + }); + }); +}); + +describe("resolveActiveEngine", () => { + const allAvailable = { webSpeech: true, whisperWeb: true, audioCapture: true }; + + it("auto picks webSpeech first when available", () => { + expect(resolveActiveEngine("auto", allAvailable, true)).toBe("webSpeech"); + }); + + it("auto falls back to whisperWeb when webSpeech is missing", () => { + expect( + resolveActiveEngine("auto", { webSpeech: false, whisperWeb: true, audioCapture: true }, true), + ).toBe("whisperWeb"); + }); + + it("auto falls back to whisperServer when no in-browser engine is available", () => { + expect( + resolveActiveEngine( + "auto", + { webSpeech: false, whisperWeb: false, audioCapture: true }, + true, + ), + ).toBe("whisperServer"); + }); + + it("returns null when nothing is usable", () => { + expect( + resolveActiveEngine( + "auto", + { webSpeech: false, whisperWeb: false, audioCapture: false }, + true, + ), + ).toBeNull(); + }); + + it("honors a pinned engine when usable", () => { + expect(resolveActiveEngine("whisperWeb", allAvailable, true)).toBe("whisperWeb"); + }); + + it("falls back along the auto order when the pinned engine is missing", () => { + expect( + resolveActiveEngine( + "whisperWeb", + { webSpeech: 
true, whisperWeb: false, audioCapture: true }, + true, + ), + ).toBe("webSpeech"); + }); + + it("treats whisperServer as unusable when serverFallbackEnabled is false", () => { + expect( + resolveActiveEngine( + "whisperServer", + { webSpeech: false, whisperWeb: false, audioCapture: true }, + false, + ), + ).toBeNull(); + }); +}); diff --git a/apps/web/lib/voice/capabilities.ts b/apps/web/lib/voice/capabilities.ts new file mode 100644 index 000000000..6fd36f161 --- /dev/null +++ b/apps/web/lib/voice/capabilities.ts @@ -0,0 +1,75 @@ +"use client"; + +import type { VoiceInputEngine } from "@/lib/types/http-voice"; + +/** + * Capability report for the voice-mode engines available in the current + * browser. Shared between `useVoiceInput` (which picks the active engine) + * and the Voice Mode settings page (which decides which options to render). + */ +export type VoiceCapabilities = { + webSpeech: boolean; + whisperWeb: boolean; + /** True if the browser supports MediaRecorder + getUserMedia, the floor + * for any audio-capture engine (whisperWeb + whisperServer). */ + audioCapture: boolean; +}; + +/** + * Detects which voice engines this browser can run. Safe to call during + * SSR — returns all-false instead of throwing on missing globals. + */ +export function detectVoiceCapabilities(): VoiceCapabilities { + if (typeof window === "undefined") { + return { webSpeech: false, whisperWeb: false, audioCapture: false }; + } + const w = window as Window & { + SpeechRecognition?: unknown; + webkitSpeechRecognition?: unknown; + }; + const webSpeech = !!(w.SpeechRecognition || w.webkitSpeechRecognition); + const audioCapture = + typeof navigator !== "undefined" && + typeof navigator.mediaDevices?.getUserMedia === "function" && + typeof window.MediaRecorder !== "undefined"; + // whisper-web piggybacks on transformers.js which only needs a Worker plus + // either WebGPU or WebAssembly. 
Every modern browser has both, so the + // gating constraint is having MediaRecorder for capture. + const whisperWeb = audioCapture && typeof Worker !== "undefined"; + return { webSpeech, whisperWeb, audioCapture }; +} + +/** + * Resolves the active voice-input engine given a user preference and the + * detected capabilities. Returns null when nothing usable is available. + * + * Auto-fallback order: Web Speech (cheapest, native) → Whisper Web (private, + * heavier) → Whisper Server (always works but requires a configured server). + * If the user pinned a specific engine that isn't available, we degrade + * gracefully along the same order. + */ +export function resolveActiveEngine( + preference: VoiceInputEngine, + caps: VoiceCapabilities, + serverFallbackEnabled: boolean, +): Exclude | null { + const order: Array> = [ + "webSpeech", + "whisperWeb", + "whisperServer", + ]; + + const isUsable = (e: Exclude) => { + if (e === "webSpeech") return caps.webSpeech; + if (e === "whisperWeb") return caps.whisperWeb; + return caps.audioCapture && serverFallbackEnabled; + }; + + if (preference === "auto") { + return order.find(isUsable) ?? null; + } + if (isUsable(preference)) return preference; + // Pinned engine isn't usable — fall through to the next available one in + // the auto order so the button still works instead of silently no-op. + return order.find(isUsable) ?? null; +} diff --git a/apps/web/lib/voice/whisper-web-client.ts b/apps/web/lib/voice/whisper-web-client.ts new file mode 100644 index 000000000..0922d4864 --- /dev/null +++ b/apps/web/lib/voice/whisper-web-client.ts @@ -0,0 +1,186 @@ +"use client"; + +import { whisperModelConfig } from "./whisper-web-models"; +import type { WhisperWebModelSize } from "@/lib/types/http-voice"; + +/** + * Sample rate Whisper expects. 
We resample the captured audio to this rate + * (mono Float32Array) before sending to the worker — Whisper's own decoder + * would do this too, but doing it here keeps the worker focused on inference. + */ +const WHISPER_SAMPLE_RATE = 16000; + +export type WhisperWebProgress = { + stage: string; + progress: number; +}; + +export type WhisperWebHandlers = { + onProgress?: (p: WhisperWebProgress) => void; +}; + +type WorkerMessage = + | { type: "progress"; stage: string; progress: number } + | { type: "ready" } + | { type: "result"; text: string } + | { type: "error"; message: string }; + +type Pending = { + kind: "init" | "transcribe"; + resolve: (value: string | undefined) => void; + reject: (err: Error) => void; +}; + +/** + * Client wrapper around the whisper-web worker. Hides the postMessage + * protocol behind a clean promise-based API and handles the audio decode + + * resample step so callers only see "Blob in, transcript out". + */ +export class WhisperWebClient { + private worker: Worker | null = null; + private pending: Pending | null = null; + private ready = false; + private loadingModelId: string | null = null; + + constructor(private handlers: WhisperWebHandlers = {}) {} + + /** + * Lazy-creates the worker on first use. Returns a promise that resolves + * when the requested model is loaded and ready to transcribe. + */ + async init(size: WhisperWebModelSize): Promise { + const config = whisperModelConfig(size); + if (this.ready && this.loadingModelId === config.modelId) return; + this.ensureWorker(); + this.loadingModelId = config.modelId; + this.ready = false; + await this.send({ kind: "init", payload: { type: "init", model: config.modelId } }); + this.ready = true; + } + + /** + * Transcribe a recorded blob. The blob may be in any container the browser + * can decode (audio/webm, audio/wav, audio/mp4, …) — we resample everything + * to 16 kHz mono Float32 before handing to the worker. 
+ */ + async transcribe(blob: Blob, language?: string): Promise { + if (!this.ready || !this.worker) { + throw new Error("WhisperWebClient: not initialized"); + } + const audio = await blobToWhisperFloat32(blob); + const text = await this.send({ + kind: "transcribe", + payload: { type: "transcribe", audio, language }, + transfer: [audio.buffer], + }); + return text ?? ""; + } + + /** Tear down the worker and release the loaded model. */ + dispose(): void { + if (this.worker) { + try { + this.worker.postMessage({ type: "dispose" }); + } catch { + // ignore + } + this.worker.terminate(); + this.worker = null; + } + this.ready = false; + this.loadingModelId = null; + if (this.pending) { + this.pending.reject(new Error("WhisperWebClient disposed")); + this.pending = null; + } + } + + private ensureWorker() { + if (this.worker) return; + // The `new Worker(new URL(..., import.meta.url))` form is Next.js / webpack's + // recommended pattern — webpack handles the bundling and asset path. + this.worker = new Worker(new URL("../../workers/whisper-web.worker.ts", import.meta.url), { + type: "module", + }); + this.worker.addEventListener("message", (e: MessageEvent) => + this.handleMessage(e.data), + ); + this.worker.addEventListener("error", (e) => { + const err = new Error(e.message || "Whisper worker crashed"); + if (this.pending) { + this.pending.reject(err); + this.pending = null; + } + }); + } + + private send(args: { + kind: "init" | "transcribe"; + payload: object; + transfer?: Transferable[]; + }): Promise { + if (!this.worker) throw new Error("WhisperWebClient: worker not initialized"); + if (this.pending) { + return Promise.reject(new Error("WhisperWebClient: another request is in flight")); + } + return new Promise((resolve, reject) => { + this.pending = { kind: args.kind, resolve, reject }; + this.worker?.postMessage(args.payload, args.transfer ?? 
[]); + }); + } + + private handleMessage(msg: WorkerMessage) { + if (msg.type === "progress") { + this.handlers.onProgress?.({ stage: msg.stage, progress: msg.progress }); + return; + } + const pending = this.pending; + if (!pending) return; + this.pending = null; + if (msg.type === "error") { + pending.reject(new Error(msg.message)); + return; + } + if (msg.type === "ready") { + pending.resolve(undefined); + return; + } + if (msg.type === "result") { + pending.resolve(msg.text); + } + } +} + +/** + * Decode an arbitrary audio Blob and return a Float32Array sampled at 16 kHz + * mono — the format Whisper expects. + */ +export async function blobToWhisperFloat32(blob: Blob): Promise { + const arrayBuffer = await blob.arrayBuffer(); + // Decode using an AudioContext at the source rate, then bounce through an + // OfflineAudioContext for the resample. AudioContext.decodeAudioData + // tolerates webm/opus, mp4/aac, wav, ogg — anything the browser can play. + const AudioCtor = + window.AudioContext ?? 
+ (window as unknown as { webkitAudioContext?: typeof AudioContext }).webkitAudioContext; + if (!AudioCtor) throw new Error("AudioContext is not available in this browser"); + const decodeCtx = new AudioCtor(); + let decoded: AudioBuffer; + try { + decoded = await decodeCtx.decodeAudioData(arrayBuffer); + } finally { + await decodeCtx.close(); + } + return resampleToMono16k(decoded); +} + +async function resampleToMono16k(buf: AudioBuffer): Promise { + const length = Math.ceil((buf.duration * WHISPER_SAMPLE_RATE) / 1); + const offline = new OfflineAudioContext(1, length, WHISPER_SAMPLE_RATE); + const source = offline.createBufferSource(); + source.buffer = buf; + source.connect(offline.destination); + source.start(0); + const rendered = await offline.startRendering(); + return rendered.getChannelData(0).slice(); +} diff --git a/apps/web/lib/voice/whisper-web-models.ts b/apps/web/lib/voice/whisper-web-models.ts new file mode 100644 index 000000000..c6d4629c8 --- /dev/null +++ b/apps/web/lib/voice/whisper-web-models.ts @@ -0,0 +1,41 @@ +import type { WhisperWebModelSize } from "@/lib/types/http-voice"; + +export type WhisperModelConfig = { + size: WhisperWebModelSize; + /** Hugging Face model id (an `onnx-community/whisper-*` mirror exported for transformers.js). */ + modelId: string; + /** Rough on-disk size after download, shown in the settings UI. */ + approxBytes: number; + /** Human-readable label. */ + label: string; +}; + +// The `onnx-community/whisper-*` mirrors are the maintained transformers.js +// exports. The older `Xenova/whisper-*` mirrors default to 4-bit (`MatMulNBits`) +// weights that only run on WebGPU — on WASM they fail with +// `Missing required scale: ... weight_merged_0_scale`. The worker pins the +// non-quantized fp32 encoder / fp16 decoder weights from these mirrors instead.
+export const WHISPER_WEB_MODELS: Record = { + tiny: { + size: "tiny", + modelId: "onnx-community/whisper-tiny", + approxBytes: 40 * 1024 * 1024, + label: "Whisper Tiny", + }, + base: { + size: "base", + modelId: "onnx-community/whisper-base", + approxBytes: 75 * 1024 * 1024, + label: "Whisper Base", + }, + small: { + size: "small", + modelId: "onnx-community/whisper-small", + approxBytes: 240 * 1024 * 1024, + label: "Whisper Small", + }, +}; + +export function whisperModelConfig(size: WhisperWebModelSize): WhisperModelConfig { + return WHISPER_WEB_MODELS[size] ?? WHISPER_WEB_MODELS.base; +} diff --git a/apps/web/lib/ws/handlers/users.ts b/apps/web/lib/ws/handlers/users.ts index 1ddb7a71c..0b33698d3 100644 --- a/apps/web/lib/ws/handlers/users.ts +++ b/apps/web/lib/ws/handlers/users.ts @@ -1,6 +1,7 @@ import type { StoreApi } from "zustand"; import type { AppState } from "@/lib/state/store"; import type { WsHandlers } from "@/lib/ws/handlers/types"; +import { parseVoiceMode } from "@/lib/ssr/user-settings"; export function registerUsersHandlers(store: StoreApi): WsHandlers { return { @@ -31,6 +32,7 @@ export function registerUsersHandlers(store: StoreApi): WsHandlers { ? "browser_panel" : "new_tab", changesPanelLayout: message.payload.changes_panel_layout === "tree" ?
"tree" : "flat", + voiceMode: parseVoiceMode(message.payload.voice_mode), loaded: true, }, })); diff --git a/apps/web/package.json b/apps/web/package.json index 7de93f075..369517e61 100644 --- a/apps/web/package.json +++ b/apps/web/package.json @@ -38,6 +38,7 @@ "@dnd-kit/core": "^6.3.1", "@dnd-kit/sortable": "^10.0.0", "@dnd-kit/utilities": "^3.2.2", + "@huggingface/transformers": "^4.2.0", "@kandev/theme": "workspace:*", "@kandev/types": "workspace:*", "@kandev/ui": "workspace:*", diff --git a/apps/web/workers/whisper-web.worker.ts b/apps/web/workers/whisper-web.worker.ts new file mode 100644 index 000000000..b333796c0 --- /dev/null +++ b/apps/web/workers/whisper-web.worker.ts @@ -0,0 +1,138 @@ +/// + +/** + * Web Worker that runs OpenAI Whisper entirely in the browser via + * @huggingface/transformers (the maintained transformers.js library that + * xenova/whisper-web is built on). + * + * Lives in its own worker because model loading + inference both block the + * main thread for several seconds — would freeze the chat input otherwise. + * + * Wire protocol (postMessage): + * in: { type: "init", model: "onnx-community/whisper-base" } + * in: { type: "transcribe", audio: Float32Array, language?: string } + * in: { type: "dispose" } + * out: { type: "progress", stage: string, progress: number } + * out: { type: "ready" } + * out: { type: "result", text: string } + * out: { type: "error", message: string } + */ + +import { pipeline, env, type AutomaticSpeechRecognitionPipeline } from "@huggingface/transformers"; + +// Disable transformers.js's local-models lookup — we only load from the HF +// CDN so the worker doesn't try to fetch files from our own origin.
+env.allowLocalModels = false; +env.allowRemoteModels = true; + +type InitMessage = { type: "init"; model: string }; +type TranscribeMessage = { type: "transcribe"; audio: Float32Array; language?: string }; +type DisposeMessage = { type: "dispose" }; +type InMessage = InitMessage | TranscribeMessage | DisposeMessage; + +type OutMessage = + | { type: "progress"; stage: string; progress: number } + | { type: "ready" } + | { type: "result"; text: string } + | { type: "error"; message: string }; + +const ctx = self as unknown as DedicatedWorkerGlobalScope; + +let asrPipeline: AutomaticSpeechRecognitionPipeline | null = null; +let activeModelId: string | null = null; + +function post(message: OutMessage) { + ctx.postMessage(message); +} + +type ProgressEvent = { + status?: string; + file?: string; + progress?: number; +}; + +async function handleInit(msg: InitMessage) { + if (asrPipeline && activeModelId === msg.model) { + post({ type: "ready" }); + return; + } + if (asrPipeline) { + await asrPipeline.dispose(); + asrPipeline = null; + } + try { + // dtype choice rationale: the `_quantized` / `q8` and `q4` decoder weights + // for whisper-base both contain `MatMulNBits` ops that only execute on + // WebGPU. On browsers without WebGPU (most Firefox, older Chrome) onnxruntime + // throws `Missing required scale: ... weight_merged_0_scale`. fp16 has no + // quantized ops at all so it works on both WASM and WebGPU; it's ~half the + // size of fp32 with no perceptible accuracy loss for ASR. + const created = await pipeline("automatic-speech-recognition", msg.model, { + dtype: { + encoder_model: "fp32", + decoder_model_merged: "fp16", + }, + progress_callback: (e: ProgressEvent) => { + if (typeof e?.progress === "number") { + post({ + type: "progress", + stage: e.status ?? 
"download", + progress: e.progress, + }); + } + }, + }); + asrPipeline = created as AutomaticSpeechRecognitionPipeline; + activeModelId = msg.model; + post({ type: "ready" }); + } catch (err) { + post({ type: "error", message: errorMessage(err) }); + } +} + +async function handleTranscribe(msg: TranscribeMessage) { + if (!asrPipeline) { + post({ type: "error", message: "Whisper worker not initialized" }); + return; + } + try { + const result = (await asrPipeline(msg.audio, { + language: msg.language && msg.language !== "auto" ? msg.language : undefined, + task: "transcribe", + })) as { text?: string } | Array<{ text?: string }>; + const text = Array.isArray(result) + ? result.map((r) => r.text ?? "").join(" ") + : (result.text ?? ""); + post({ type: "result", text: text.trim() }); + } catch (err) { + post({ type: "error", message: errorMessage(err) }); + } +} + +async function handleDispose() { + if (asrPipeline) { + await asrPipeline.dispose(); + asrPipeline = null; + activeModelId = null; + } +} + +function errorMessage(err: unknown): string { + if (err instanceof Error) return err.message; + return String(err); +} + +ctx.addEventListener("message", (event: MessageEvent) => { + const msg = event.data; + switch (msg.type) { + case "init": + void handleInit(msg); + break; + case "transcribe": + void handleTranscribe(msg); + break; + case "dispose": + void handleDispose(); + break; + } +}); From 5157743232db6a0a59af451d95bb3c8934b3a809 Mon Sep 17 00:00:00 2001 From: Kandev Agent Date: Fri, 29 May 2026 17:58:05 +0100 Subject: [PATCH 2/4] fix: address voice mode PR review feedback and E2E selector clash Bot-review fixes from #1159: - Map http.MaxBytesError from c.FormFile to 413 instead of bare 400. - Strip BCP-47 region suffix before passing language hint to Whisper Web. - Recover from worker crash: terminate + null worker + reset ready flag so next init/transcribe creates a fresh worker instead of hanging. 
- Read live store state via storeApi.getState() in voice-mode settings savers to avoid stale-closure clobbering concurrent settings updates. - Detach Web Speech callbacks before abort() so trailing onend/onerror events don't mutate hook state after cancel(). - Spread caller init before method/body in voice-api fetch so a caller- supplied init can't override the multipart POST. - Replace t.Fatalf with t.Errorf+return inside test HTTP handler goroutine (FailNow from non-test goroutine is unsafe). - Add Enabled/AutoSend zeroing assertions in partial-update test so the bool-overwrite-on-PATCH behavior is locked in explicitly. E2E fix: - toolbar-overflow spec used button.rounded-full which now matches both the send and voice buttons (strict-mode violation). Switched to the submit-message-button testid for an unambiguous locator. --- .../internal/user/service/service_test.go | 16 +++++++-- .../voice/handlers/transcribe_handlers.go | 14 ++++++-- .../internal/voice/transcribe/service_test.go | 9 +++-- .../settings/voice-mode-settings.tsx | 36 +++++++++++++------ .../e2e/tests/chat/toolbar-overflow.spec.ts | 6 ++-- apps/web/hooks/use-voice-input.ts | 21 ++++++++--- apps/web/lib/api/domains/voice-api.ts | 6 ++-- apps/web/lib/voice/whisper-web-client.ts | 8 +++++ 8 files changed, 90 insertions(+), 26 deletions(-) diff --git a/apps/backend/internal/user/service/service_test.go b/apps/backend/internal/user/service/service_test.go index ae01c50d6..4a460d00e 100644 --- a/apps/backend/internal/user/service/service_test.go +++ b/apps/backend/internal/user/service/service_test.go @@ -512,16 +512,22 @@ func TestApplyVoiceMode(t *testing.T) { } }) - t.Run("partial update preserves untouched fields", func(t *testing.T) { + t.Run("partial update preserves string fields but zeroes booleans", func(t *testing.T) { settings := &models.UserSettings{ VoiceMode: models.VoiceModeSettings{ + Enabled: true, Engine: "whisperServer", Language: "en-GB", Mode: "toggle", + AutoSend: true, 
WhisperWebModel: "tiny", }, } - // Empty strings on the new payload mean "no change" for those fields. + // Empty strings on the new payload mean "no change" for the string fields, + // but bools have no "unset" sentinel — every PATCH carries them. The settings + // UI always sends the full VoiceMode object so partial updates here would + // only happen in test or hand-crafted requests; the assertions below lock in + // that explicit behavior so it doesn't drift silently. err := applyVoiceMode(settings, &models.VoiceModeSettings{Engine: "webSpeech"}) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -538,5 +544,11 @@ func TestApplyVoiceMode(t *testing.T) { if settings.VoiceMode.WhisperWebModel != "tiny" { t.Fatalf("expected whisper model preserved, got %q", settings.VoiceMode.WhisperWebModel) } + if settings.VoiceMode.Enabled { + t.Fatalf("expected Enabled zeroed on partial update, got true") + } + if settings.VoiceMode.AutoSend { + t.Fatalf("expected AutoSend zeroed on partial update, got true") + } }) } diff --git a/apps/backend/internal/voice/handlers/transcribe_handlers.go b/apps/backend/internal/voice/handlers/transcribe_handlers.go index 517ad2e26..bb9992e3a 100644 --- a/apps/backend/internal/voice/handlers/transcribe_handlers.go +++ b/apps/backend/internal/voice/handlers/transcribe_handlers.go @@ -50,12 +50,20 @@ func (h *Handlers) httpTranscribe(c *gin.Context) { return } - // MaxBytesReader makes the io.ReadAll below short-circuit with an error - // once the cap is exceeded, instead of letting Gin buffer the whole body. + // MaxBytesReader caps multipart parsing — once the cap is exceeded, Gin's + // multipart parser surfaces *http.MaxBytesError out of c.FormFile (because + // it reads the whole body through the wrapped reader before we ever get + // the *FileHeader). We need to distinguish that case from a genuinely + // missing field so the client sees 413 instead of a misleading 400. 
c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, maxAudioBytes) fh, err := c.FormFile("audio") if err != nil { + var maxBytesErr *http.MaxBytesError + if errors.As(err, &maxBytesErr) { + c.JSON(http.StatusRequestEntityTooLarge, gin.H{"error": "audio payload too large"}) + return + } c.JSON(http.StatusBadRequest, gin.H{"error": "audio file is required (multipart field 'audio')"}) return } @@ -71,7 +79,7 @@ func (h *Handlers) httpTranscribe(c *gin.Context) { data, err := io.ReadAll(file) if err != nil { h.log.Warn("read uploaded audio failed", zap.Error(err)) - c.JSON(http.StatusRequestEntityTooLarge, gin.H{"error": "audio payload too large or unreadable"}) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read uploaded audio"}) return } if len(data) == 0 { diff --git a/apps/backend/internal/voice/transcribe/service_test.go b/apps/backend/internal/voice/transcribe/service_test.go index c7eb0f1cb..b8c8315cf 100644 --- a/apps/backend/internal/voice/transcribe/service_test.go +++ b/apps/backend/internal/voice/transcribe/service_test.go @@ -53,14 +53,19 @@ func TestService_Transcribe_Success(t *testing.T) { } capturedModel = r.FormValue("model") capturedFormat = r.FormValue("response_format") + // Use Errorf + return inside the HTTP handler goroutine — t.Fatalf + // from a non-test goroutine triggers FailNow which panics rather than + // failing the test cleanly. 
fh := r.MultipartForm.File["file"] if len(fh) != 1 { - t.Fatalf("expected 1 file part, got %d", len(fh)) + t.Errorf("expected 1 file part, got %d", len(fh)) + return } capturedFilename = fh[0].Filename f, err := fh[0].Open() if err != nil { - t.Fatalf("open file: %v", err) + t.Errorf("open file: %v", err) + return } defer func() { _ = f.Close() }() capturedFileBytes, _ = io.ReadAll(f) diff --git a/apps/web/components/settings/voice-mode-settings.tsx b/apps/web/components/settings/voice-mode-settings.tsx index 610c85fc1..ff3a3683a 100644 --- a/apps/web/components/settings/voice-mode-settings.tsx +++ b/apps/web/components/settings/voice-mode-settings.tsx @@ -16,7 +16,7 @@ import { SelectValue, } from "@kandev/ui/select"; import { Switch } from "@kandev/ui/switch"; -import { useAppStore } from "@/components/state-provider"; +import { useAppStore, useAppStoreApi } from "@/components/state-provider"; import { useToast } from "@/components/toast-provider"; import { updateUserSettings } from "@/lib/api"; import { SettingsSection } from "@/components/settings/settings-section"; @@ -78,27 +78,36 @@ function toWire(state: VoiceModeState): VoiceModeWire { // ── Save hook ──────────────────────────────────────────────────────────── function useVoiceModeSaver() { - const userSettings = useAppStore((s) => s.userSettings); + // Read userSettings via the store API (not as a React selector) so the + // async save handler reads the latest snapshot at invocation time instead + // of capturing a stale closure. Without this, concurrent settings updates + // racing with this save (or a rejection rolling back to a stale snapshot) + // can silently overwrite unrelated fields. 
+ const storeApi = useAppStoreApi(); const setUserSettings = useAppStore((s) => s.setUserSettings); const { toast } = useToast(); const [saving, setSaving] = useState(false); const save = useCallback( async (patch: Partial) => { - const previous = userSettings.voiceMode; + const current = storeApi.getState().userSettings; + const previous = current.voiceMode; const next = { ...previous, ...patch }; - setUserSettings({ ...userSettings, voiceMode: next }); + setUserSettings({ ...current, voiceMode: next }); setSaving(true); try { await updateUserSettings({ voice_mode: toWire(next) }); } catch { - setUserSettings({ ...userSettings, voiceMode: previous }); + // Rollback against the *current* state (which may have moved on + // since we kicked off the request), not against the original. + const latest = storeApi.getState().userSettings; + setUserSettings({ ...latest, voiceMode: previous }); toast({ title: "Failed to save Voice Mode setting", variant: "error" }); } finally { setSaving(false); } }, - [userSettings, setUserSettings, toast], + [storeApi, setUserSettings, toast], ); return { save, saving }; @@ -396,19 +405,24 @@ function AvailabilityBanner({ caps }: { caps: VoiceCapabilities }) { // ── Voice keyboard shortcut card ───────────────────────────────────────── function useShortcutSaver() { - const userSettings = useAppStore((s) => s.userSettings); + // Same stale-closure protection as useVoiceModeSaver — read live store + // state at call time so a concurrent keyboard-shortcut change from another + // settings card isn't clobbered by this card's optimistic update / rollback. 
+ const storeApi = useAppStoreApi(); const setUserSettings = useAppStore((s) => s.setUserSettings); const { toast } = useToast(); return useCallback( (next: StoredShortcutOverrides) => { - const previous = userSettings.keyboardShortcuts; - setUserSettings({ ...userSettings, keyboardShortcuts: next }); + const current = storeApi.getState().userSettings; + const previous = current.keyboardShortcuts; + setUserSettings({ ...current, keyboardShortcuts: next }); updateUserSettings({ keyboard_shortcuts: next }).catch(() => { - setUserSettings({ ...userSettings, keyboardShortcuts: previous }); + const latest = storeApi.getState().userSettings; + setUserSettings({ ...latest, keyboardShortcuts: previous }); toast({ title: "Failed to save shortcut", variant: "error" }); }); }, - [userSettings, setUserSettings, toast], + [storeApi, setUserSettings, toast], ); } diff --git a/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts b/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts index 3722d205b..fdbcbfb42 100644 --- a/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts +++ b/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts @@ -95,8 +95,10 @@ test.describe("Toolbar overflow menu", () => { // Context badge should be hidden when collapsed to avoid clipping await expect(contextBadge).not.toBeVisible(); - // Submit button should remain visible (always-visible item) - const submitBtn = toolbar.locator("button.rounded-full"); + // Submit button should remain visible (always-visible item). Target the + // submit testid specifically — the voice input button is also round, so a + // bare `button.rounded-full` locator now matches both and fails strict mode. 
+ const submitBtn = toolbar.getByTestId("submit-message-button"); await expect(submitBtn).toBeVisible(); // Click expand toggle — items appear inline (scrollable) diff --git a/apps/web/hooks/use-voice-input.ts b/apps/web/hooks/use-voice-input.ts index 7f11b79cf..60c508aca 100644 --- a/apps/web/hooks/use-voice-input.ts +++ b/apps/web/hooks/use-voice-input.ts @@ -127,8 +127,14 @@ function resolveLang(preference: string): string { } function resolveWhisperLang(preference: string): string | undefined { - if (preference && preference !== "auto") return preference; - return undefined; + if (!preference || preference === "auto") return undefined; + // Whisper's tokenizer only knows ISO 639-1 two-letter codes ("en", "pt"). + // The settings UI stores BCP-47 ("en-US", "pt-BR") so we can render + // human-friendly variant names — strip the region suffix here so the hint + // isn't silently dropped by the pipeline (which would then auto-detect and + // potentially pick the wrong dialect). + const dash = preference.indexOf("-"); + return dash > 0 ? preference.slice(0, dash).toLowerCase() : preference.toLowerCase(); } // ── MediaRecorder capture primitive ───────────────────────────────────── @@ -209,8 +215,15 @@ type WhisperRefBox = { current: WhisperWebClient | null }; function abortDriver(ref: DriverRefBox) { const driver = ref.current; if (!driver) return; - if (driver.kind === "webSpeech") driver.recognition.abort(); - else teardownCapture(driver.handle); + if (driver.kind === "webSpeech") { + // Detach callbacks before aborting so the trailing onerror/onend events + // that some browsers fire after .abort() don't sneak through and mutate + // hook state that the caller (cancel()) just reset. 
+ driver.recognition.onresult = null; + driver.recognition.onerror = null; + driver.recognition.onend = null; + driver.recognition.abort(); + } else teardownCapture(driver.handle); ref.current = null; } diff --git a/apps/web/lib/api/domains/voice-api.ts b/apps/web/lib/api/domains/voice-api.ts index 88e65def5..d3af1a571 100644 --- a/apps/web/lib/api/domains/voice-api.ts +++ b/apps/web/lib/api/domains/voice-api.ts @@ -23,11 +23,13 @@ export async function transcribeAudio( formData.append("audio", blob, filename); // Do NOT set Content-Type: the browser sets multipart/form-data with the - // correct boundary automatically when given a FormData body. + // correct boundary automatically when given a FormData body. Spread caller + // init *first* so method/body always win — otherwise a caller passing + // `init: { method: "GET" }` (or a stale body) would silently break the upload. const response = await fetch(`${baseUrl}/api/v1/transcribe`, { + ...options?.init, method: "POST", body: formData, - ...options?.init, }); if (!response.ok) { diff --git a/apps/web/lib/voice/whisper-web-client.ts b/apps/web/lib/voice/whisper-web-client.ts index 0922d4864..e55cd9dc6 100644 --- a/apps/web/lib/voice/whisper-web-client.ts +++ b/apps/web/lib/voice/whisper-web-client.ts @@ -107,6 +107,14 @@ export class WhisperWebClient { ); this.worker.addEventListener("error", (e) => { const err = new Error(e.message || "Whisper worker crashed"); + // Tear the dead worker down so the next init()/transcribe() recreates + // a fresh one. Without this, ensureWorker() short-circuits on the + // still-truthy reference, posts to nothing, and the new request hangs + // forever — bricking voice input until a full page reload. 
+ this.worker?.terminate(); + this.worker = null; + this.ready = false; + this.loadingModelId = null; if (this.pending) { this.pending.reject(err); this.pending = null; From 283c001cd881e8adb6a8d71b32c3937fc84093da Mon Sep 17 00:00:00 2001 From: Kandev Agent Date: Fri, 29 May 2026 18:22:42 +0100 Subject: [PATCH 3/4] fix: scope voice settings rollback + whisper worker error to changed keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 of PR review feedback on #1159: - voice-mode settings rollback now restores only the keys patched by the failing request (not the whole snapshot), preserving concurrent edits to unrelated voiceMode fields. - Shortcut rollback diffs previous vs next and reverts only the changed keys (restore prior value or delete if absent), preserving other concurrent shortcut edits. - Whisper worker error handler captures the worker reference at attach time and only clears refs when this.worker still matches it — a stale error from a previously replaced worker no longer nukes the active one. - Update stale whisper-web-models JSDoc that still pointed at Xenova mirrors. --- .../settings/voice-mode-settings.tsx | 27 ++++++++++++++++--- apps/web/lib/voice/whisper-web-client.ts | 21 +++++++++------ apps/web/lib/voice/whisper-web-models.ts | 3 ++- 3 files changed, 38 insertions(+), 13 deletions(-) diff --git a/apps/web/components/settings/voice-mode-settings.tsx b/apps/web/components/settings/voice-mode-settings.tsx index ff3a3683a..8dcab78b9 100644 --- a/apps/web/components/settings/voice-mode-settings.tsx +++ b/apps/web/components/settings/voice-mode-settings.tsx @@ -98,10 +98,19 @@ function useVoiceModeSaver() { try { await updateUserSettings({ voice_mode: toWire(next) }); } catch { - // Rollback against the *current* state (which may have moved on - // since we kicked off the request), not against the original. 
+ // Rollback only the keys this request changed — restoring the whole + // voiceMode snapshot would clobber any concurrent edits to other + // fields that landed while the request was in flight. const latest = storeApi.getState().userSettings; - setUserSettings({ ...latest, voiceMode: previous }); + const reverted: Partial = {}; + for (const key of Object.keys(patch) as Array) { + // Cast through unknown so the per-key assignment passes strict checks. + (reverted as Record)[key] = previous[key]; + } + setUserSettings({ + ...latest, + voiceMode: { ...latest.voiceMode, ...reverted }, + }); toast({ title: "Failed to save Voice Mode setting", variant: "error" }); } finally { setSaving(false); @@ -417,8 +426,18 @@ function useShortcutSaver() { const previous = current.keyboardShortcuts; setUserSettings({ ...current, keyboardShortcuts: next }); updateUserSettings({ keyboard_shortcuts: next }).catch(() => { + // Rollback only the keys this request changed, restoring their prior + // values (or deleting them if they didn't exist). Replacing the whole + // map would clobber unrelated shortcut edits that landed since. 
const latest = storeApi.getState().userSettings; - setUserSettings({ ...latest, keyboardShortcuts: previous }); + const restored: StoredShortcutOverrides = { ...latest.keyboardShortcuts }; + const changedKeys = new Set([...Object.keys(previous), ...Object.keys(next)]); + for (const key of changedKeys) { + if (previous[key] === next[key]) continue; + if (previous[key] === undefined) delete restored[key]; + else restored[key] = previous[key]; + } + setUserSettings({ ...latest, keyboardShortcuts: restored }); toast({ title: "Failed to save shortcut", variant: "error" }); }); }, diff --git a/apps/web/lib/voice/whisper-web-client.ts b/apps/web/lib/voice/whisper-web-client.ts index e55cd9dc6..e9d1cc620 100644 --- a/apps/web/lib/voice/whisper-web-client.ts +++ b/apps/web/lib/voice/whisper-web-client.ts @@ -105,16 +105,21 @@ export class WhisperWebClient { this.worker.addEventListener("message", (e: MessageEvent) => this.handleMessage(e.data), ); + // Capture the worker reference at listener-attach time. A late error from + // a previously-disposed worker can still bubble up after we've already + // created its replacement; without the identity check below, that stale + // event would terminate the brand-new worker too. + const ownWorker = this.worker; this.worker.addEventListener("error", (e) => { const err = new Error(e.message || "Whisper worker crashed"); - // Tear the dead worker down so the next init()/transcribe() recreates - // a fresh one. Without this, ensureWorker() short-circuits on the - // still-truthy reference, posts to nothing, and the new request hangs - // forever — bricking voice input until a full page reload. - this.worker?.terminate(); - this.worker = null; - this.ready = false; - this.loadingModelId = null; + ownWorker?.terminate(); + // Only clear our refs if this is still the active worker — a stale + // error from a worker we already replaced must not nuke the new one. 
+ if (this.worker === ownWorker) { + this.worker = null; + this.ready = false; + this.loadingModelId = null; + } if (this.pending) { this.pending.reject(err); this.pending = null; diff --git a/apps/web/lib/voice/whisper-web-models.ts b/apps/web/lib/voice/whisper-web-models.ts index c6d4629c8..eaffe6698 100644 --- a/apps/web/lib/voice/whisper-web-models.ts +++ b/apps/web/lib/voice/whisper-web-models.ts @@ -2,7 +2,8 @@ import type { WhisperWebModelSize } from "@/lib/types/http-voice"; export type WhisperModelConfig = { size: WhisperWebModelSize; - /** Hugging Face model id (the Xenova/* mirrors are pre-quantized for transformers.js). */ + /** Hugging Face model id. Use the `onnx-community/*` mirrors — `Xenova/*` + * defaults to 4-bit MatMulNBits weights that crash on WASM (see note below). */ modelId: string; /** Rough on-disk size after download, shown in the settings UI. */ approxBytes: number; From ffe88a69959b404b1104c7cd58fce15f945071b2 Mon Sep 17 00:00:00 2001 From: Kandev Agent Date: Fri, 29 May 2026 18:48:27 +0100 Subject: [PATCH 4/4] fix: voice mode progress scale, hold-mode race, race-aware rollback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 3 of PR review feedback on #1159: - Normalise Whisper progress to 0–1 in ensureWhisperClient (transformers.js emits 0–100, the button's display + the `ready: 1` convention expect 0–1 — fixes the 5000% mid-download display). - Claim driverRef synchronously at the top of finishCapture so concurrent pointerup + pointerleave invocations early-return instead of racing and clobbering a freshly-started recording's ref. - Race-aware rollback: voice-mode and shortcut catch branches now skip reverting a key when latest[key] !== next[key]. A failed earlier request no longer overwrites a later successful save to the same key. - Update the Worker wire-protocol comment to show onnx-community/whisper-base instead of the stale Xenova/* example. 
--- .../components/settings/voice-mode-settings.tsx | 15 +++++++++------ apps/web/hooks/use-voice-input.ts | 13 +++++++++++-- apps/web/workers/whisper-web.worker.ts | 2 +- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/apps/web/components/settings/voice-mode-settings.tsx b/apps/web/components/settings/voice-mode-settings.tsx index 8dcab78b9..1a0d702cd 100644 --- a/apps/web/components/settings/voice-mode-settings.tsx +++ b/apps/web/components/settings/voice-mode-settings.tsx @@ -98,12 +98,14 @@ function useVoiceModeSaver() { try { await updateUserSettings({ voice_mode: toWire(next) }); } catch { - // Rollback only the keys this request changed — restoring the whole - // voiceMode snapshot would clobber any concurrent edits to other - // fields that landed while the request was in flight. + // Rollback only the keys this request changed AND only when the live + // value still matches what we optimistically wrote. If a newer save + // for the same key landed first, that's now the truth — reverting + // would silently roll back the user's later edit. const latest = storeApi.getState().userSettings; const reverted: Partial = {}; for (const key of Object.keys(patch) as Array) { + if (latest.voiceMode[key] !== next[key]) continue; // Cast through unknown so the per-key assignment passes strict checks. (reverted as Record)[key] = previous[key]; } @@ -426,14 +428,15 @@ function useShortcutSaver() { const previous = current.keyboardShortcuts; setUserSettings({ ...current, keyboardShortcuts: next }); updateUserSettings({ keyboard_shortcuts: next }).catch(() => { - // Rollback only the keys this request changed, restoring their prior - // values (or deleting them if they didn't exist). Replacing the whole - // map would clobber unrelated shortcut edits that landed since. + // Rollback only the keys this request changed AND only when the live + // value still matches what we optimistically wrote. 
Skip otherwise so + // a newer successful save to the same key isn't reverted. const latest = storeApi.getState().userSettings; const restored: StoredShortcutOverrides = { ...latest.keyboardShortcuts }; const changedKeys = new Set([...Object.keys(previous), ...Object.keys(next)]); for (const key of changedKeys) { if (previous[key] === next[key]) continue; + if (latest.keyboardShortcuts[key] !== next[key]) continue; if (previous[key] === undefined) delete restored[key]; else restored[key] = previous[key]; } diff --git a/apps/web/hooks/use-voice-input.ts b/apps/web/hooks/use-voice-input.ts index 60c508aca..454df30f8 100644 --- a/apps/web/hooks/use-voice-input.ts +++ b/apps/web/hooks/use-voice-input.ts @@ -306,9 +306,14 @@ type FinishCaptureHandlers = { async function finishCapture(h: FinishCaptureHandlers): Promise { const driver = h.driverRef.current; if (!driver || driver.kind !== "capture") return; + // Claim the driver synchronously *before* the first await. In hold mode, + // pointerup + pointerleave both fire in the same task and both call stop(); + // without this early null, the second invocation would also enter + // finishCapture, race the first, and could clobber a brand-new recording's + // driverRef if the user re-triggered between them. + h.driverRef.current = null; h.setState("processing"); const blob = await stopCapture(driver.handle); - h.driverRef.current = null; if (!blob) { h.setState("idle"); return; @@ -341,7 +346,11 @@ async function ensureWhisperClient(h: FinishCaptureHandlers): Promise - h.setModelLoad({ state: "loading", progress: p.progress }), + // transformers.js emits progress on a 0–100 scale, but the rest of the + // pipeline (and the button's `* 100` display) treats `modelLoad.progress` + // as a 0–1 fraction (matching the `ready: 1` convention below). Normalise + // here so the button doesn't render "5000%" mid-download. 
+ h.setModelLoad({ state: "loading", progress: p.progress / 100 }), }); h.setModelLoad({ state: "loading", progress: 0 }); } diff --git a/apps/web/workers/whisper-web.worker.ts b/apps/web/workers/whisper-web.worker.ts index b333796c0..68fa33e4b 100644 --- a/apps/web/workers/whisper-web.worker.ts +++ b/apps/web/workers/whisper-web.worker.ts @@ -9,7 +9,7 @@ * main thread for several seconds — would freeze the chat input otherwise. * * Wire protocol (postMessage): - * in: { type: "init", model: "Xenova/whisper-base" } + * in: { type: "init", model: "onnx-community/whisper-base" } * in: { type: "transcribe", audio: Float32Array, language?: string } * in: { type: "dispose" } * out: { type: "progress", stage: string, progress: number }