diff --git a/apps/backend/cmd/kandev/helpers.go b/apps/backend/cmd/kandev/helpers.go index 58eeec1f5..dccbb750d 100644 --- a/apps/backend/cmd/kandev/helpers.go +++ b/apps/backend/cmd/kandev/helpers.go @@ -71,6 +71,8 @@ import ( userhandlers "github.com/kandev/kandev/internal/user/handlers" utilitycontroller "github.com/kandev/kandev/internal/utility/controller" utilityhandlers "github.com/kandev/kandev/internal/utility/handlers" + voicehandlers "github.com/kandev/kandev/internal/voice/handlers" + "github.com/kandev/kandev/internal/voice/transcribe" workflowcontroller "github.com/kandev/kandev/internal/workflow/controller" workflowhandlers "github.com/kandev/kandev/internal/workflow/handlers" "github.com/kandev/kandev/internal/worktree" @@ -449,6 +451,7 @@ type routeParams struct { devMode bool httpPort int features config.FeaturesConfig + voice config.VoiceConfig log *logger.Logger } @@ -698,6 +701,11 @@ func registerSecondaryRoutes( utilityhandlers.RegisterRoutes(p.router, p.utilityCtrl, p.lifecycleMgr, p.hostUtilityMgr, p.services.User, p.log) p.log.Debug("Registered Utility Agents handlers (HTTP)") + // Voice transcription fallback. The route always mounts, but returns 503 + // when no API key is configured so the frontend can hide the path. 
+ voicehandlers.RegisterRoutes(p.router, transcribe.New(p.voice.OpenAIAPIKey), p.log) + p.log.Debug("Registered Voice handlers (HTTP)") + agentcapabilities.RegisterRoutes(p.router, p.hostUtilityMgr, p.log) p.log.Debug("Registered Agent Capabilities handlers (HTTP)") diff --git a/apps/backend/cmd/kandev/main.go b/apps/backend/cmd/kandev/main.go index fa3070382..4f771448c 100644 --- a/apps/backend/cmd/kandev/main.go +++ b/apps/backend/cmd/kandev/main.go @@ -1513,6 +1513,7 @@ func buildHTTPServer( devMode: cfg.Debug.DevMode || cfg.Debug.PprofEnabled, httpPort: port, features: cfg.Features, + voice: cfg.Voice, log: log, }) diff --git a/apps/backend/internal/common/config/config.go b/apps/backend/internal/common/config/config.go index 2d76bcca4..9028d3dce 100644 --- a/apps/backend/internal/common/config/config.go +++ b/apps/backend/internal/common/config/config.go @@ -41,6 +41,7 @@ type Config struct { RepoClone RepoCloneConfig `mapstructure:"repoClone"` Debug DebugConfig `mapstructure:"debug"` Office OfficeConfig `mapstructure:"office"` + Voice VoiceConfig `mapstructure:"voice"` Features FeaturesConfig `mapstructure:"features"` } @@ -147,6 +148,20 @@ type OfficeConfig struct { JWTSigningKey string `mapstructure:"jwtSigningKey"` } +// VoiceConfig holds configuration for the chat voice-input transcription +// fallback. The primary voice-input engine runs entirely in the browser +// (Web Speech API); this server-side fallback is only used when the browser +// has no SpeechRecognition support (e.g. Firefox). +// +// When OpenAIAPIKey is empty the /api/v1/transcribe endpoint returns 503 +// and the frontend hides the fallback path, so the feature is safe to +// ship un-configured. +type VoiceConfig struct { + // OpenAIAPIKey is the API key used to call OpenAI's Whisper transcription + // endpoint. Set via KANDEV_VOICE_OPENAI_API_KEY. + OpenAIAPIKey string `mapstructure:"openAIApiKey"` +} + // FeaturesConfig is the central registry of runtime feature flags. 
Every flag // defaults to false so production binaries ship with new work hidden until a // deployment explicitly opts in (env var, e.g. KANDEV_FEATURES_OFFICE=true). @@ -312,6 +327,9 @@ func setDefaults(v *viper.Viper) { // Office defaults v.SetDefault("office.jwtSigningKey", "") + // Voice defaults + v.SetDefault("voice.openAIApiKey", "") + // Feature-flag defaults live in ./features.yaml (symlinked to // apps/backend/internal/features/features.yaml). LoadWithPath applies // them via features.ApplyDefaults after this function returns so the @@ -428,6 +446,7 @@ func LoadWithPath(configPath string) (*Config, error) { _ = v.BindEnv("events.namespace", "KANDEV_EVENTS_NAMESPACE") _ = v.BindEnv("debug.devMode", "KANDEV_DEBUG_DEV_MODE") _ = v.BindEnv("debug.pprofEnabled", "KANDEV_DEBUG_PPROF_ENABLED") + _ = v.BindEnv("voice.openAIApiKey", "KANDEV_VOICE_OPENAI_API_KEY") // Configure config file v.SetConfigName("config") diff --git a/apps/backend/internal/user/controller/controller.go b/apps/backend/internal/user/controller/controller.go index 2b6cb8c8f..1ca49c98e 100644 --- a/apps/backend/internal/user/controller/controller.go +++ b/apps/backend/internal/user/controller/controller.go @@ -68,6 +68,7 @@ func (c *Controller) UpdateUserSettings(ctx context.Context, req dto.UpdateUserS TerminalFontFamily: req.TerminalFontFamily, TerminalFontSize: req.TerminalFontSize, ChangesPanelLayout: req.ChangesPanelLayout, + VoiceMode: req.VoiceMode, }) if err != nil { return dto.UserSettingsResponse{}, err diff --git a/apps/backend/internal/user/dto/dto.go b/apps/backend/internal/user/dto/dto.go index 450f11b2d..3329aeb1a 100644 --- a/apps/backend/internal/user/dto/dto.go +++ b/apps/backend/internal/user/dto/dto.go @@ -39,6 +39,7 @@ type UserSettingsDTO struct { TerminalFontFamily string `json:"terminal_font_family"` TerminalFontSize int `json:"terminal_font_size"` ChangesPanelLayout string `json:"changes_panel_layout"` + VoiceMode models.VoiceModeSettings `json:"voice_mode"` UpdatedAt 
string `json:"updated_at"` } @@ -82,6 +83,7 @@ type UpdateUserSettingsRequest struct { TerminalFontFamily *string `json:"terminal_font_family,omitempty"` TerminalFontSize *int `json:"terminal_font_size,omitempty"` ChangesPanelLayout *string `json:"changes_panel_layout,omitempty"` + VoiceMode *models.VoiceModeSettings `json:"voice_mode,omitempty"` } func FromUser(user *models.User) UserDTO { @@ -120,6 +122,7 @@ func FromUserSettings(settings *models.UserSettings) UserSettingsDTO { TerminalFontFamily: settings.TerminalFontFamily, TerminalFontSize: settings.TerminalFontSize, ChangesPanelLayout: settings.ChangesPanelLayout, + VoiceMode: settings.VoiceMode, UpdatedAt: settings.UpdatedAt.Format(time.RFC3339), } } diff --git a/apps/backend/internal/user/models/models.go b/apps/backend/internal/user/models/models.go index 4b48a5ff4..80475e904 100644 --- a/apps/backend/internal/user/models/models.go +++ b/apps/backend/internal/user/models/models.go @@ -38,10 +38,36 @@ type UserSettings struct { TerminalFontFamily string `json:"terminal_font_family"` TerminalFontSize int `json:"terminal_font_size"` ChangesPanelLayout string `json:"changes_panel_layout"` // "flat" | "tree" + VoiceMode VoiceModeSettings `json:"voice_mode"` CreatedAt time.Time `json:"created_at"` UpdatedAt time.Time `json:"updated_at"` } +// VoiceModeSettings is the per-user configuration surface for the chat +// voice-input feature. Stored as a nested JSON object inside the `users.settings` +// blob — adding fields here does not require a schema migration. +type VoiceModeSettings struct { + // Enabled gates the whole feature. When false, the mic button is hidden + // entirely and no voice-related hooks run on the chat input. Defaults to + // true for new users; pre-existing user rows that have no `enabled` field + // in their stored JSON are also treated as enabled (see store layer). + Enabled bool `json:"enabled"` + // Engine is the user's preferred transcription engine. 
+ // "auto" | "webSpeech" | "whisperWeb" | "whisperServer". Default "auto". + Engine string `json:"engine"` + // Language is the BCP-47 tag or "auto" to use the browser's language. + // Examples: "en-US", "pt-PT", "ja-JP". Default "auto". + Language string `json:"language"` + // Mode controls how the mic button is activated: "toggle" (click to start/stop) + // or "hold" (push-to-talk). Default "toggle". + Mode string `json:"mode"` + // AutoSend submits the chat message immediately after the transcript is inserted. + AutoSend bool `json:"auto_send"` + // WhisperWebModel selects the in-browser Whisper model when engine = whisperWeb. + // "tiny" | "base" | "small". Default "base". + WhisperWebModel string `json:"whisper_web_model"` +} + // SavedLayout represents a user-saved dockview layout configuration. type SavedLayout struct { ID string `json:"id"` diff --git a/apps/backend/internal/user/service/service.go b/apps/backend/internal/user/service/service.go index f83991a86..1a2709335 100644 --- a/apps/backend/internal/user/service/service.go +++ b/apps/backend/internal/user/service/service.go @@ -58,6 +58,7 @@ type UpdateUserSettingsRequest struct { TerminalFontFamily *string TerminalFontSize *int ChangesPanelLayout *string + VoiceMode *models.VoiceModeSettings } func NewService(repo store.Repository, eventBus bus.EventBus, log *logger.Logger) *Service { @@ -122,6 +123,9 @@ func (s *Service) UpdateUserSettings(ctx context.Context, req *UpdateUserSetting if err := applySidebarViews(settings, req); err != nil { return nil, fmt.Errorf("%w: %s", ErrValidation, err.Error()) } + if err := applyVoiceMode(settings, req.VoiceMode); err != nil { + return nil, fmt.Errorf("%w: %s", ErrValidation, err.Error()) + } settings.UpdatedAt = time.Now().UTC() if err := s.repo.UpsertUserSettings(ctx, settings); err != nil { return nil, err @@ -220,6 +224,66 @@ func applyChangesPanelLayout(settings *models.UserSettings, value *string) error return nil } +var ( + validVoiceEngines = 
map[string]struct{}{ + "auto": {}, + "webSpeech": {}, + "whisperWeb": {}, + "whisperServer": {}, + } + validVoiceModes = map[string]struct{}{ + "toggle": {}, + "hold": {}, + } + validWhisperWebModels = map[string]struct{}{ + "tiny": {}, + "base": {}, + "small": {}, + } +) + +// applyVoiceMode validates the inbound voice-mode settings and merges them +// onto the user record. Each sub-field is validated independently so a +// partial update (e.g. just `engine`) still works. +// +// `enabled` and `auto_send` are plain bools — every PATCH carries them. The +// settings UI always sends the full VoiceMode object so partial updates that +// would otherwise zero these are not a real concern. +func applyVoiceMode(settings *models.UserSettings, value *models.VoiceModeSettings) error { + if value == nil { + return nil + } + current := settings.VoiceMode + if current.Engine == "" { + current.Engine = "auto" + } + if value.Engine != "" { + if _, ok := validVoiceEngines[value.Engine]; !ok { + return errors.New("voice_mode.engine must be 'auto', 'webSpeech', 'whisperWeb', or 'whisperServer'") + } + current.Engine = value.Engine + } + if value.Language != "" { + current.Language = strings.TrimSpace(value.Language) + } + if value.Mode != "" { + if _, ok := validVoiceModes[value.Mode]; !ok { + return errors.New("voice_mode.mode must be 'toggle' or 'hold'") + } + current.Mode = value.Mode + } + if value.WhisperWebModel != "" { + if _, ok := validWhisperWebModels[value.WhisperWebModel]; !ok { + return errors.New("voice_mode.whisper_web_model must be 'tiny', 'base', or 'small'") + } + current.WhisperWebModel = value.WhisperWebModel + } + current.AutoSend = value.AutoSend + current.Enabled = value.Enabled + settings.VoiceMode = current + return nil +} + // applyChatSubmitKey validates and applies the chat_submit_key setting. 
func (s *Service) applyChatSubmitKey(settings *models.UserSettings, req *UpdateUserSettingsRequest) error { if req.ChatSubmitKey == nil { @@ -332,6 +396,7 @@ func (s *Service) publishUserSettingsEvent(ctx context.Context, settings *models "terminal_font_family": settings.TerminalFontFamily, "terminal_font_size": settings.TerminalFontSize, "changes_panel_layout": settings.ChangesPanelLayout, + "voice_mode": settings.VoiceMode, "updated_at": settings.UpdatedAt.Format(time.RFC3339), } if err := s.eventBus.Publish(ctx, events.UserSettingsUpdated, bus.NewEvent(events.UserSettingsUpdated, "user-service", data)); err != nil { diff --git a/apps/backend/internal/user/service/service_test.go b/apps/backend/internal/user/service/service_test.go index 1e3a1efdb..4a460d00e 100644 --- a/apps/backend/internal/user/service/service_test.go +++ b/apps/backend/internal/user/service/service_test.go @@ -441,3 +441,114 @@ func TestApplySidebarViews(t *testing.T) { }) } } + +func TestApplyVoiceMode(t *testing.T) { + t.Run("nil value leaves settings unchanged", func(t *testing.T) { + settings := &models.UserSettings{ + VoiceMode: models.VoiceModeSettings{Engine: "webSpeech", Language: "en-US"}, + } + if err := applyVoiceMode(settings, nil); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if settings.VoiceMode.Engine != "webSpeech" || settings.VoiceMode.Language != "en-US" { + t.Fatalf("expected unchanged, got %+v", settings.VoiceMode) + } + }) + + t.Run("happy path: applies a full update", func(t *testing.T) { + settings := &models.UserSettings{} + err := applyVoiceMode(settings, &models.VoiceModeSettings{ + Enabled: true, + Engine: "whisperWeb", + Language: "pt-PT", + Mode: "hold", + AutoSend: true, + WhisperWebModel: "small", + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + want := models.VoiceModeSettings{ + Enabled: true, + Engine: "whisperWeb", + Language: "pt-PT", + Mode: "hold", + AutoSend: true, + WhisperWebModel: "small", + } + if settings.VoiceMode 
!= want { + t.Fatalf("expected %+v, got %+v", want, settings.VoiceMode) + } + }) + + t.Run("enabled=false is honored (user disabled the feature)", func(t *testing.T) { + settings := &models.UserSettings{VoiceMode: models.VoiceModeSettings{Enabled: true}} + if err := applyVoiceMode(settings, &models.VoiceModeSettings{Enabled: false}); err != nil { + t.Fatalf("unexpected error: %v", err) + } + if settings.VoiceMode.Enabled { + t.Fatalf("expected Enabled=false after disable, got true") + } + }) + + t.Run("invalid engine is rejected", func(t *testing.T) { + err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{Engine: "bogus"}) + if err == nil || !strings.Contains(err.Error(), "voice_mode.engine") { + t.Fatalf("expected engine validation error, got %v", err) + } + }) + + t.Run("invalid mode is rejected", func(t *testing.T) { + err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{Mode: "tap"}) + if err == nil || !strings.Contains(err.Error(), "voice_mode.mode") { + t.Fatalf("expected mode validation error, got %v", err) + } + }) + + t.Run("invalid whisper_web_model is rejected", func(t *testing.T) { + err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{WhisperWebModel: "huge"}) + if err == nil || !strings.Contains(err.Error(), "voice_mode.whisper_web_model") { + t.Fatalf("expected model validation error, got %v", err) + } + }) + + t.Run("partial update preserves string fields but zeroes booleans", func(t *testing.T) { + settings := &models.UserSettings{ + VoiceMode: models.VoiceModeSettings{ + Enabled: true, + Engine: "whisperServer", + Language: "en-GB", + Mode: "toggle", + AutoSend: true, + WhisperWebModel: "tiny", + }, + } + // Empty strings on the new payload mean "no change" for the string fields, + // but bools have no "unset" sentinel — every PATCH carries them. 
The settings + // UI always sends the full VoiceMode object so partial updates here would + // only happen in test or hand-crafted requests; the assertions below lock in + // that explicit behavior so it doesn't drift silently. + err := applyVoiceMode(settings, &models.VoiceModeSettings{Engine: "webSpeech"}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if settings.VoiceMode.Engine != "webSpeech" { + t.Fatalf("expected engine=webSpeech, got %q", settings.VoiceMode.Engine) + } + if settings.VoiceMode.Language != "en-GB" { + t.Fatalf("expected language preserved, got %q", settings.VoiceMode.Language) + } + if settings.VoiceMode.Mode != "toggle" { + t.Fatalf("expected mode preserved, got %q", settings.VoiceMode.Mode) + } + if settings.VoiceMode.WhisperWebModel != "tiny" { + t.Fatalf("expected whisper model preserved, got %q", settings.VoiceMode.WhisperWebModel) + } + if settings.VoiceMode.Enabled { + t.Fatalf("expected Enabled zeroed on partial update, got true") + } + if settings.VoiceMode.AutoSend { + t.Fatalf("expected AutoSend zeroed on partial update, got true") + } + }) +} diff --git a/apps/backend/internal/user/store/sqlite.go b/apps/backend/internal/user/store/sqlite.go index 954c9f93b..6671caf8c 100644 --- a/apps/backend/internal/user/store/sqlite.go +++ b/apps/backend/internal/user/store/sqlite.go @@ -162,6 +162,7 @@ func (r *sqliteRepository) UpsertUserSettings(ctx context.Context, settings *mod "terminal_font_family": settings.TerminalFontFamily, "terminal_font_size": settings.TerminalFontSize, "changes_panel_layout": settings.ChangesPanelLayout, + "voice_mode": settings.VoiceMode, }) if err != nil { return err @@ -192,6 +193,58 @@ func scanUser(scanner interface{ Scan(dest ...any) error }) (*models.User, error return user, nil } +// defaultVoiceModeSettings returns the baseline VoiceMode configuration for +// users with no saved preferences. Mirrored on the frontend; keep in sync. 
+func defaultVoiceModeSettings() models.VoiceModeSettings { + return models.VoiceModeSettings{ + Enabled: true, + Engine: "auto", + Language: "auto", + Mode: "toggle", + AutoSend: false, + WhisperWebModel: "base", + } +} + +// storedVoiceMode is the on-disk JSON shape — uses *bool for `enabled` so we +// can distinguish "absent" (older rows written before the toggle existed — +// must default to true) from "explicitly false" (user disabled the feature). +type storedVoiceMode struct { + Enabled *bool `json:"enabled"` + Engine string `json:"engine"` + Language string `json:"language"` + Mode string `json:"mode"` + AutoSend bool `json:"auto_send"` + WhisperWebModel string `json:"whisper_web_model"` +} + +// mergeVoiceModeDefaults fills in zero/missing fields on a stored VoiceMode +// payload so older user rows (written before VoiceMode existed) still produce +// usable settings instead of empty strings the frontend would reject. +func mergeVoiceModeDefaults(stored *storedVoiceMode) models.VoiceModeSettings { + out := defaultVoiceModeSettings() + if stored == nil { + return out + } + if stored.Enabled != nil { + out.Enabled = *stored.Enabled + } + if stored.Engine != "" { + out.Engine = stored.Engine + } + if stored.Language != "" { + out.Language = stored.Language + } + if stored.Mode != "" { + out.Mode = stored.Mode + } + if stored.WhisperWebModel != "" { + out.WhisperWebModel = stored.WhisperWebModel + } + out.AutoSend = stored.AutoSend + return out +} + func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID string) (*models.UserSettings, error) { settings := &models.UserSettings{} var settingsRaw string @@ -208,6 +261,7 @@ func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID strin settings.TerminalLinkBehavior = "new_tab" settings.ChangesPanelLayout = "flat" settings.SidebarViews = []models.SidebarView{} + settings.VoiceMode = defaultVoiceModeSettings() return settings, nil } var payload struct { @@ -235,6 +289,7 @@ func 
scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID strin TerminalFontFamily string `json:"terminal_font_family"` TerminalFontSize int `json:"terminal_font_size"` ChangesPanelLayout string `json:"changes_panel_layout"` + VoiceMode *storedVoiceMode `json:"voice_mode"` } if err := json.Unmarshal([]byte(settingsRaw), &payload); err != nil { return nil, err @@ -294,6 +349,7 @@ func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID strin } settings.TerminalFontFamily = payload.TerminalFontFamily settings.TerminalFontSize = payload.TerminalFontSize + settings.VoiceMode = mergeVoiceModeDefaults(payload.VoiceMode) if payload.ChangesPanelLayout == "tree" { settings.ChangesPanelLayout = "tree" } else { diff --git a/apps/backend/internal/voice/handlers/transcribe_handlers.go b/apps/backend/internal/voice/handlers/transcribe_handlers.go new file mode 100644 index 000000000..bb9992e3a --- /dev/null +++ b/apps/backend/internal/voice/handlers/transcribe_handlers.go @@ -0,0 +1,117 @@ +// Package handlers exposes the HTTP surface for the voice-input transcription +// fallback. The endpoint is unauthenticated (matches /api/v1/features) — the +// Web Speech API path is preferred by the frontend, so this server-side +// fallback only runs when the browser cannot do it locally. +package handlers + +import ( + "errors" + "io" + "net/http" + + "github.com/gin-gonic/gin" + "go.uber.org/zap" + + "github.com/kandev/kandev/internal/common/logger" + "github.com/kandev/kandev/internal/voice/transcribe" +) + +// maxAudioBytes caps the multipart audio payload. Whisper accepts up to 25 MB +// per request; we cap lower so a stuck mic doesn't blow up backend memory or +// burn API spend on a stuck recording. +const maxAudioBytes = 10 * 1024 * 1024 + +// Handlers wires the transcribe service into Gin routes. +type Handlers struct { + svc *transcribe.Service + log *logger.Logger +} + +// NewHandlers constructs a Handlers from a transcribe Service. 
+func NewHandlers(svc *transcribe.Service, log *logger.Logger) *Handlers { + return &Handlers{ + svc: svc, + log: log.WithFields(zap.String("component", "voice-handlers")), + } +} + +// RegisterRoutes mounts the voice transcription endpoint. +func RegisterRoutes(router *gin.Engine, svc *transcribe.Service, log *logger.Logger) { + h := NewHandlers(svc, log) + api := router.Group("/api/v1") + api.POST("/transcribe", h.httpTranscribe) +} + +func (h *Handlers) httpTranscribe(c *gin.Context) { + if h.svc == nil || !h.svc.Configured() { + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "voice transcription is not configured on this server", + }) + return + } + + // MaxBytesReader caps multipart parsing — once the cap is exceeded, Gin's + // multipart parser surfaces *http.MaxBytesError out of c.FormFile (because + // it reads the whole body through the wrapped reader before we ever get + // the *FileHeader). We need to distinguish that case from a genuinely + // missing field so the client sees 413 instead of a misleading 400. 
+ c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, maxAudioBytes) + + fh, err := c.FormFile("audio") + if err != nil { + var maxBytesErr *http.MaxBytesError + if errors.As(err, &maxBytesErr) { + c.JSON(http.StatusRequestEntityTooLarge, gin.H{"error": "audio payload too large"}) + return + } + c.JSON(http.StatusBadRequest, gin.H{"error": "audio file is required (multipart field 'audio')"}) + return + } + + file, err := fh.Open() + if err != nil { + h.log.Warn("open uploaded audio failed", zap.Error(err)) + c.JSON(http.StatusBadRequest, gin.H{"error": "cannot open uploaded audio"}) + return + } + defer func() { _ = file.Close() }() + + data, err := io.ReadAll(file) + if err != nil { + h.log.Warn("read uploaded audio failed", zap.Error(err)) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read uploaded audio"}) + return + } + if len(data) == 0 { + c.JSON(http.StatusBadRequest, gin.H{"error": "audio file is empty"}) + return + } + + mime := fh.Header.Get("Content-Type") + text, err := h.svc.Transcribe(c.Request.Context(), data, mime, fh.Filename) + if err != nil { + h.respondError(c, err) + return + } + c.JSON(http.StatusOK, gin.H{"text": text}) +} + +func (h *Handlers) respondError(c *gin.Context, err error) { + if errors.Is(err, transcribe.ErrNotConfigured) { + c.JSON(http.StatusServiceUnavailable, gin.H{ + "error": "voice transcription is not configured on this server", + }) + return + } + var upstream *transcribe.UpstreamError + if errors.As(err, &upstream) { + h.log.Warn("whisper upstream error", + zap.Int("status", upstream.StatusCode), + zap.String("body", upstream.Body), + ) + c.JSON(http.StatusBadGateway, gin.H{"error": "upstream transcription error"}) + return + } + h.log.Error("transcription failed", zap.Error(err)) + c.JSON(http.StatusInternalServerError, gin.H{"error": "transcription failed"}) +} diff --git a/apps/backend/internal/voice/handlers/transcribe_handlers_test.go 
b/apps/backend/internal/voice/handlers/transcribe_handlers_test.go new file mode 100644 index 000000000..aa3170578 --- /dev/null +++ b/apps/backend/internal/voice/handlers/transcribe_handlers_test.go @@ -0,0 +1,157 @@ +package handlers + +import ( + "bytes" + "encoding/json" + "mime/multipart" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/gin-gonic/gin" + + "github.com/kandev/kandev/internal/common/logger" + "github.com/kandev/kandev/internal/voice/transcribe" +) + +func init() { + gin.SetMode(gin.TestMode) +} + +func testLogger(t *testing.T) *logger.Logger { + t.Helper() + log, err := logger.NewLogger(logger.LoggingConfig{Level: "error", Format: "text", OutputPath: "stderr"}) + if err != nil { + t.Fatalf("logger.NewLogger: %v", err) + } + return log +} + +func buildAudioRequest(t *testing.T, field, filename, mime string, data []byte) (*http.Request, string) { + t.Helper() + buf := &bytes.Buffer{} + w := multipart.NewWriter(buf) + if data != nil { + fw, err := createFormFile(w, field, filename, mime) + if err != nil { + t.Fatal(err) + } + _, _ = fw.Write(data) + } + _ = w.Close() + req := httptest.NewRequest(http.MethodPost, "/api/v1/transcribe", buf) + req.Header.Set("Content-Type", w.FormDataContentType()) + return req, w.FormDataContentType() +} + +func createFormFile(w *multipart.Writer, field, filename, mime string) (interface{ Write([]byte) (int, error) }, error) { + if mime == "" { + return w.CreateFormFile(field, filename) + } + hdr := make(map[string][]string) + hdr["Content-Disposition"] = []string{"form-data; name=\"" + field + "\"; filename=\"" + filename + "\""} + hdr["Content-Type"] = []string{mime} + return w.CreatePart(hdr) +} + +func newRouter(svc *transcribe.Service, t *testing.T) *gin.Engine { + r := gin.New() + RegisterRoutes(r, svc, testLogger(t)) + return r +} + +func TestTranscribe_NotConfigured(t *testing.T) { + svc := transcribe.New("") + r := newRouter(svc, t) + + req, _ := buildAudioRequest(t, "audio", 
"a.webm", "audio/webm", []byte("hello")) + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Fatalf("status = %d, want 503; body=%s", w.Code, w.Body.String()) + } +} + +func TestTranscribe_MissingFile(t *testing.T) { + svc := transcribe.New("sk-test") + r := newRouter(svc, t) + + // No file part — just an empty form. + buf := &bytes.Buffer{} + w := multipart.NewWriter(buf) + _ = w.Close() + req := httptest.NewRequest(http.MethodPost, "/api/v1/transcribe", buf) + req.Header.Set("Content-Type", w.FormDataContentType()) + + rr := httptest.NewRecorder() + r.ServeHTTP(rr, req) + if rr.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want 400; body=%s", rr.Code, rr.Body.String()) + } +} + +func TestTranscribe_EmptyAudio(t *testing.T) { + svc := transcribe.New("sk-test") + r := newRouter(svc, t) + + req, _ := buildAudioRequest(t, "audio", "a.webm", "audio/webm", []byte{}) + rr := httptest.NewRecorder() + r.ServeHTTP(rr, req) + if rr.Code != http.StatusBadRequest { + t.Fatalf("status = %d, want 400; body=%s", rr.Code, rr.Body.String()) + } +} + +func TestTranscribe_Success(t *testing.T) { + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Header.Get("Authorization") != "Bearer sk-test" { + t.Errorf("auth header missing") + } + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"text":"transcribed"}`)) + })) + defer upstream.Close() + + svc := transcribe.New("sk-test", transcribe.WithEndpoint(upstream.URL)) + r := newRouter(svc, t) + + req, _ := buildAudioRequest(t, "audio", "clip.webm", "audio/webm", []byte("bytes")) + rr := httptest.NewRecorder() + r.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("status = %d, want 200; body=%s", rr.Code, rr.Body.String()) + } + var body struct { + Text string `json:"text"` + } + if err := json.Unmarshal(rr.Body.Bytes(), &body); err != nil { + t.Fatal(err) + } + if body.Text != 
"transcribed" { + t.Errorf("text = %q", body.Text) + } +} + +func TestTranscribe_UpstreamError(t *testing.T) { + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + _, _ = w.Write([]byte(`{"error":"oops"}`)) + })) + defer upstream.Close() + + svc := transcribe.New("sk-test", transcribe.WithEndpoint(upstream.URL)) + r := newRouter(svc, t) + req, _ := buildAudioRequest(t, "audio", "a.webm", "audio/webm", []byte("bytes")) + rr := httptest.NewRecorder() + r.ServeHTTP(rr, req) + + if rr.Code != http.StatusBadGateway { + t.Fatalf("status = %d, want 502; body=%s", rr.Code, rr.Body.String()) + } + if !strings.Contains(rr.Body.String(), "upstream") { + t.Errorf("body should mention upstream: %s", rr.Body.String()) + } +} diff --git a/apps/backend/internal/voice/transcribe/service.go b/apps/backend/internal/voice/transcribe/service.go new file mode 100644 index 000000000..fad7b13b9 --- /dev/null +++ b/apps/backend/internal/voice/transcribe/service.go @@ -0,0 +1,185 @@ +// Package transcribe wraps the OpenAI Whisper transcription endpoint for the +// chat voice-input fallback. The browser's Web Speech API is the primary +// voice-input engine; this server-side path is only hit when the browser +// has no SpeechRecognition support. +package transcribe + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "mime/multipart" + "net/http" + "net/textproto" + "strings" + "time" +) + +// ErrNotConfigured is returned when the service has no API key — the handler +// maps this to HTTP 503 so the frontend can hide the Whisper fallback path +// instead of repeatedly retrying a deployment that will never succeed. +var ErrNotConfigured = errors.New("voice transcription is not configured") + +// UpstreamError wraps a non-2xx response from OpenAI so the handler can map +// it to HTTP 502 and surface a clean error to the caller. 
type UpstreamError struct {
	// StatusCode is the HTTP status returned by the upstream endpoint.
	StatusCode int
	// Body is the raw response body, kept for logging and diagnostics.
	Body string
}

// Error implements the error interface, reporting the upstream status and body.
func (e *UpstreamError) Error() string {
	return fmt.Sprintf("openai whisper upstream error: status=%d body=%s", e.StatusCode, e.Body)
}

const (
	defaultEndpoint = "https://api.openai.com/v1/audio/transcriptions"
	defaultModel    = "whisper-1"
	defaultTimeout  = 60 * time.Second
)

// Service transcribes audio via OpenAI's Whisper endpoint.
type Service struct {
	apiKey   string
	endpoint string
	model    string
	client   *http.Client
}

// Option customises a Service for tests (custom endpoint, HTTP client).
type Option func(*Service)

// WithEndpoint overrides the upstream URL — used by tests with httptest servers.
func WithEndpoint(url string) Option {
	return func(s *Service) { s.endpoint = url }
}

// WithHTTPClient overrides the HTTP client. A nil client is ignored so the
// default client (which carries defaultTimeout) stays in place instead of
// leaving the Service with a nil client that panics on first use.
func WithHTTPClient(c *http.Client) Option {
	return func(s *Service) {
		if c != nil {
			s.client = c
		}
	}
}

// WithModel overrides the Whisper model name.
func WithModel(model string) Option {
	return func(s *Service) { s.model = model }
}

// New constructs a Service. apiKey may be empty; in that case Transcribe
// returns ErrNotConfigured without making any network calls.
//
// Surrounding whitespace is stripped from apiKey once here, so the value
// Configured inspects is the same value that is later sent upstream. Keys
// loaded from env files often carry a trailing newline, which would otherwise
// pass the Configured check and then produce an unusable Authorization header.
func New(apiKey string, opts ...Option) *Service {
	s := &Service{
		apiKey:   strings.TrimSpace(apiKey),
		endpoint: defaultEndpoint,
		model:    defaultModel,
		client:   &http.Client{Timeout: defaultTimeout},
	}
	for _, o := range opts {
		o(s)
	}
	return s
}

// Configured reports whether the service has an API key. Used by handlers
// to short-circuit before reading the request body. It is safe to call on a
// nil *Service, which is reported as not configured.
func (s *Service) Configured() bool {
	return s != nil && strings.TrimSpace(s.apiKey) != ""
}

// Transcribe sends the given audio bytes to OpenAI Whisper and returns the
// transcribed text. filename is used for the multipart Content-Disposition;
// Whisper relies on the file extension to detect the audio format.
+func (s *Service) Transcribe(ctx context.Context, audio []byte, mimeType, filename string) (string, error) { + if !s.Configured() { + return "", ErrNotConfigured + } + if len(audio) == 0 { + return "", errors.New("audio payload is empty") + } + + body, contentType, err := buildMultipart(audio, mimeType, filename, s.model) + if err != nil { + return "", fmt.Errorf("build multipart body: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, s.endpoint, body) + if err != nil { + return "", fmt.Errorf("build whisper request: %w", err) + } + req.Header.Set("Authorization", "Bearer "+s.apiKey) + req.Header.Set("Content-Type", contentType) + req.Header.Set("Accept", "application/json") + + resp, err := s.client.Do(req) + if err != nil { + return "", fmt.Errorf("call whisper endpoint: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + rawBody, _ := io.ReadAll(resp.Body) + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return "", &UpstreamError{StatusCode: resp.StatusCode, Body: string(rawBody)} + } + + var parsed struct { + Text string `json:"text"` + } + if err := json.Unmarshal(rawBody, &parsed); err != nil { + return "", fmt.Errorf("decode whisper response: %w", err) + } + return strings.TrimSpace(parsed.Text), nil +} + +// buildMultipart assembles the multipart/form-data body Whisper expects: +// `file`, `model`, and `response_format=json`. 
+func buildMultipart(audio []byte, mimeType, filename, model string) (io.Reader, string, error) { + buf := &bytes.Buffer{} + w := multipart.NewWriter(buf) + + if filename == "" { + filename = "recording" + extensionForMime(mimeType) + } + header := textproto.MIMEHeader{} + header.Set("Content-Disposition", fmt.Sprintf(`form-data; name="file"; filename=%q`, filename)) + if mimeType != "" { + header.Set("Content-Type", mimeType) + } + filePart, err := w.CreatePart(header) + if err != nil { + return nil, "", err + } + if _, err := filePart.Write(audio); err != nil { + return nil, "", err + } + + if err := w.WriteField("model", model); err != nil { + return nil, "", err + } + if err := w.WriteField("response_format", "json"); err != nil { + return nil, "", err + } + if err := w.Close(); err != nil { + return nil, "", err + } + return buf, w.FormDataContentType(), nil +} + +// extensionForMime maps the audio MIME types MediaRecorder commonly emits to +// the file extensions Whisper recognises. Default to ".webm" — supported by +// Whisper and the most common MediaRecorder default on Chrome. 
+func extensionForMime(mime string) string { + mime = strings.ToLower(mime) + switch { + case strings.Contains(mime, "wav"): + return ".wav" + case strings.Contains(mime, "mp4"), strings.Contains(mime, "m4a"): + return ".m4a" + case strings.Contains(mime, "mpeg"), strings.Contains(mime, "mp3"): + return ".mp3" + case strings.Contains(mime, "ogg"): + return ".ogg" + default: + return ".webm" + } +} diff --git a/apps/backend/internal/voice/transcribe/service_test.go b/apps/backend/internal/voice/transcribe/service_test.go new file mode 100644 index 000000000..b8c8315cf --- /dev/null +++ b/apps/backend/internal/voice/transcribe/service_test.go @@ -0,0 +1,222 @@ +package transcribe + +import ( + "context" + "errors" + "io" + "mime/multipart" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestService_Transcribe_NotConfigured(t *testing.T) { + svc := New("") + _, err := svc.Transcribe(context.Background(), []byte("data"), "audio/webm", "") + if !errors.Is(err, ErrNotConfigured) { + t.Fatalf("expected ErrNotConfigured, got %v", err) + } +} + +func TestService_Configured(t *testing.T) { + if New("").Configured() { + t.Errorf("empty key should not be configured") + } + if New(" ").Configured() { + t.Errorf("whitespace-only key should not be configured") + } + if !New("sk-test").Configured() { + t.Errorf("non-empty key should be configured") + } +} + +func TestService_Transcribe_EmptyAudio(t *testing.T) { + svc := New("sk-test") + _, err := svc.Transcribe(context.Background(), nil, "audio/webm", "") + if err == nil { + t.Fatal("expected error for empty audio") + } +} + +func TestService_Transcribe_Success(t *testing.T) { + var capturedAuth string + var capturedFilename string + var capturedFileBytes []byte + var capturedModel string + var capturedFormat string + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedAuth = r.Header.Get("Authorization") + if err := r.ParseMultipartForm(32 << 20); err != nil 
{ + t.Errorf("parse multipart: %v", err) + } + capturedModel = r.FormValue("model") + capturedFormat = r.FormValue("response_format") + // Use Errorf + return inside the HTTP handler goroutine — t.Fatalf + // from a non-test goroutine triggers FailNow which panics rather than + // failing the test cleanly. + fh := r.MultipartForm.File["file"] + if len(fh) != 1 { + t.Errorf("expected 1 file part, got %d", len(fh)) + return + } + capturedFilename = fh[0].Filename + f, err := fh[0].Open() + if err != nil { + t.Errorf("open file: %v", err) + return + } + defer func() { _ = f.Close() }() + capturedFileBytes, _ = io.ReadAll(f) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"text":"hello world"}`)) + })) + defer srv.Close() + + svc := New("sk-test", WithEndpoint(srv.URL)) + text, err := svc.Transcribe(context.Background(), []byte("audio-bytes"), "audio/webm", "clip.webm") + if err != nil { + t.Fatalf("Transcribe failed: %v", err) + } + if text != "hello world" { + t.Errorf("unexpected text: %q", text) + } + if capturedAuth != "Bearer sk-test" { + t.Errorf("auth header = %q", capturedAuth) + } + if capturedModel != defaultModel { + t.Errorf("model = %q", capturedModel) + } + if capturedFormat != "json" { + t.Errorf("response_format = %q", capturedFormat) + } + if capturedFilename != "clip.webm" { + t.Errorf("filename = %q", capturedFilename) + } + if string(capturedFileBytes) != "audio-bytes" { + t.Errorf("file body = %q", string(capturedFileBytes)) + } +} + +func TestService_Transcribe_DerivedFilename(t *testing.T) { + var capturedFilename string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = r.ParseMultipartForm(32 << 20) + fh := r.MultipartForm.File["file"] + if len(fh) == 1 { + capturedFilename = fh[0].Filename + } + _, _ = w.Write([]byte(`{"text":""}`)) + })) + defer srv.Close() + + svc := New("sk-test", WithEndpoint(srv.URL)) + _, err := svc.Transcribe(context.Background(), 
[]byte("a"), "audio/wav", "") + if err != nil { + t.Fatal(err) + } + if !strings.HasSuffix(capturedFilename, ".wav") { + t.Errorf("derived filename should use .wav for audio/wav, got %q", capturedFilename) + } +} + +func TestService_Transcribe_UpstreamError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"bad audio"}`)) + })) + defer srv.Close() + + svc := New("sk-test", WithEndpoint(srv.URL)) + _, err := svc.Transcribe(context.Background(), []byte("a"), "audio/webm", "") + var upstream *UpstreamError + if !errors.As(err, &upstream) { + t.Fatalf("expected UpstreamError, got %T: %v", err, err) + } + if upstream.StatusCode != http.StatusBadRequest { + t.Errorf("status = %d", upstream.StatusCode) + } + if !strings.Contains(upstream.Body, "bad audio") { + t.Errorf("body did not contain upstream payload: %q", upstream.Body) + } +} + +func TestExtensionForMime(t *testing.T) { + cases := map[string]string{ + "audio/webm": ".webm", + "audio/wav": ".wav", + "audio/x-wav": ".wav", + "audio/mp4": ".m4a", + "audio/m4a": ".m4a", + "audio/mpeg": ".mp3", + "audio/mp3": ".mp3", + "audio/ogg": ".ogg", + "": ".webm", + "application/anything": ".webm", + } + for mime, want := range cases { + if got := extensionForMime(mime); got != want { + t.Errorf("extensionForMime(%q) = %q, want %q", mime, got, want) + } + } +} + +func TestBuildMultipart_Roundtrip(t *testing.T) { + body, ct, err := buildMultipart([]byte("hello"), "audio/wav", "a.wav", "whisper-1") + if err != nil { + t.Fatal(err) + } + // Parse the multipart body back out using the boundary embedded in ct. 
+ mediaType, params, ok := splitContentType(ct) + if !ok || mediaType != "multipart/form-data" { + t.Fatalf("unexpected content-type: %q", ct) + } + mr := multipart.NewReader(body, params["boundary"]) + fields := map[string]string{} + var fileContent string + for { + part, err := mr.NextPart() + if err == io.EOF { + break + } + if err != nil { + t.Fatal(err) + } + buf, _ := io.ReadAll(part) + if part.FileName() != "" { + fileContent = string(buf) + } else { + fields[part.FormName()] = string(buf) + } + } + if fileContent != "hello" { + t.Errorf("file part = %q", fileContent) + } + if fields["model"] != "whisper-1" { + t.Errorf("model field = %q", fields["model"]) + } + if fields["response_format"] != "json" { + t.Errorf("response_format field = %q", fields["response_format"]) + } +} + +// splitContentType is a tiny helper to split "multipart/form-data; boundary=…" +// without pulling in mime.ParseMediaType — keeps this test file self-contained. +func splitContentType(ct string) (string, map[string]string, bool) { + parts := strings.SplitN(ct, ";", 2) + if len(parts) != 2 { + return "", nil, false + } + mediaType := strings.TrimSpace(parts[0]) + params := map[string]string{} + for _, kv := range strings.Split(parts[1], ";") { + kv = strings.TrimSpace(kv) + eq := strings.IndexByte(kv, '=') + if eq < 0 { + continue + } + params[kv[:eq]] = strings.Trim(kv[eq+1:], `"`) + } + return mediaType, params, true +} diff --git a/apps/pnpm-lock.yaml b/apps/pnpm-lock.yaml index e773c9190..8120436bf 100644 --- a/apps/pnpm-lock.yaml +++ b/apps/pnpm-lock.yaml @@ -246,6 +246,9 @@ importers: '@dnd-kit/utilities': specifier: ^3.2.2 version: 3.2.2(react@19.2.3) + '@huggingface/transformers': + specifier: ^4.2.0 + version: 4.2.0 '@kandev/theme': specifier: workspace:* version: link:../packages/theme @@ -419,7 +422,7 @@ importers: version: 0.55.1 next: specifier: 16.1.7 - version: 16.1.7(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3) + version: 
16.1.7(@babel/core@7.28.6)(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3) next-themes: specifier: ^0.4.6 version: 0.4.6(react-dom@19.2.3(react@19.2.3))(react@19.2.3) @@ -473,7 +476,7 @@ importers: version: 2.0.7(react-dom@19.2.3(react@19.2.3))(react@19.2.3) styled-jsx: specifier: 5.1.6 - version: 5.1.6(react@19.2.3) + version: 5.1.6(@babel/core@7.28.6)(react@19.2.3) tailwind-merge: specifier: ^3.4.0 version: 3.4.0 @@ -1406,6 +1409,16 @@ packages: peerDependencies: hono: ^4 + '@huggingface/jinja@0.5.9': + resolution: {integrity: sha512-uWTG+l3VJRsl7EXxYizuL3P+cCPoc3cRqbWWRcQN0FhejRfbdq0RNhCmbY/YDtnTcz9icdLYuLDjsnz4d8JMuw==} + engines: {node: '>=18'} + + '@huggingface/tokenizers@0.1.3': + resolution: {integrity: sha512-8rF/RRT10u+kn7YuUbUg0OF30K8rjTc78aHpxT+qJ1uWSqxT1MHi8+9ltwYfkFYJzT/oS+qw3JVfHtNMGAdqyA==} + + '@huggingface/transformers@4.2.0': + resolution: {integrity: sha512-8BRCoBMH0XsWaEIamuR0LrJGAfftgHAfb2Vrffy0VKlSAE/MnUJ5/h/zTfEP3fDIft+nk7TqB8xXEyABGitBjQ==} + '@humanfs/core@0.19.1': resolution: {integrity: sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==} engines: {node: '>=18.18.0'} @@ -1845,6 +1858,36 @@ packages: engines: {node: '>=18'} hasBin: true + '@protobufjs/aspromise@1.1.2': + resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} + + '@protobufjs/base64@1.1.2': + resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} + + '@protobufjs/codegen@2.0.5': + resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==} + + '@protobufjs/eventemitter@1.1.1': + resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==} + + '@protobufjs/fetch@1.1.1': + resolution: {integrity: 
sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==} + + '@protobufjs/float@1.0.2': + resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} + + '@protobufjs/inquire@1.1.2': + resolution: {integrity: sha512-pa0vFRuws4wkvaXKK1uXZMAwAX4/t8ANaJo45iw/oQHNQ9q5xUzwgFmVJGXiga2BeN+zpX7Vf9vmsiIa2J+MUw==} + + '@protobufjs/path@1.1.2': + resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} + + '@protobufjs/pool@1.1.0': + resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} + + '@protobufjs/utf8@1.1.1': + resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==} + '@radix-ui/number@1.1.1': resolution: {integrity: sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==} @@ -3423,6 +3466,7 @@ packages: '@ungap/structured-clone@1.3.0': resolution: {integrity: sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==} + deprecated: Potential CWE-502 - Update to 1.3.1 or higher '@unrs/resolver-binding-android-arm-eabi@1.11.1': resolution: {integrity: sha512-ppLRUgHVaGRWUx0R0Ut06Mjo9gBaBkg3v/8AxusGLhsIotbBLuRk51rAzqLC8gq6NyyAojEXglNjzf6R948DNw==} @@ -3575,6 +3619,10 @@ packages: engines: {node: '>=0.4.0'} hasBin: true + adm-zip@0.5.17: + resolution: {integrity: sha512-+Ut8d9LLqwEvHHJl1+PIHqoyDxFgVN847JTVM3Izi3xHDWPE4UtzzXysMZQs64DMcrJfBeS/uoEP4AD3HQHnQQ==} + engines: {node: '>=12.0'} + agent-base@7.1.4: resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} engines: {node: '>= 14'} @@ -3707,6 +3755,10 @@ packages: resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} 
engines: {node: '>=18'} + boolean@3.2.0: + resolution: {integrity: sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==} + deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info. + brace-expansion@1.1.12: resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==} @@ -4226,6 +4278,9 @@ packages: detect-node-es@1.1.0: resolution: {integrity: sha512-ypdmJU/TbBby2Dxibuv7ZLW3Bs1QEmM7nHjEANfohJLvE0XVujisn1qPJcZxg+qDucsr+bP6fLD1rPS3AhJ7EQ==} + detect-node@2.1.0: + resolution: {integrity: sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==} + devlop@1.1.0: resolution: {integrity: sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==} @@ -4367,6 +4422,9 @@ packages: resolution: {integrity: sha512-w+5mJ3GuFL+NjVtJlvydShqE1eN3h3PbI7/5LAsYJP/2qtuMXjfL2LpHSRqo4b4eSF5K/DH1JXKUAHSB2UW50g==} engines: {node: '>= 0.4'} + es6-error@4.1.1: + resolution: {integrity: sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg==} + esbuild@0.21.5: resolution: {integrity: sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==} engines: {node: '>=12'} @@ -4642,6 +4700,9 @@ packages: resolution: {integrity: sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==} engines: {node: '>=16'} + flatbuffers@25.9.23: + resolution: {integrity: sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ==} + flatted@3.3.3: resolution: {integrity: sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==} @@ -4765,6 +4826,10 @@ packages: resolution: {integrity: sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==} engines: {node: 
'>=10.13.0'} + global-agent@3.0.0: + resolution: {integrity: sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q==} + engines: {node: '>=10.0'} + global-directory@4.0.1: resolution: {integrity: sha512-wHTUcDUoZ1H5/0iVqEudYW4/kAlN5cZ3j/bXn0Dpbizl9iaUVeWSHqiOjsgk6OW2bkLclbBjzewBz6weQ1zA2Q==} engines: {node: '>=18'} @@ -4792,6 +4857,9 @@ packages: resolution: {integrity: sha512-DKKrynuQRne0PNpEbzuEdHlYOMksHSUI8Zc9Unei5gTsMNA2/vMpoMz/yKba50pejK56qj98qM0SjYxAKi13gQ==} engines: {node: ^12.22.0 || ^14.16.0 || ^16.0.0 || >=17.0.0} + guid-typescript@1.0.9: + resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} + hachure-fill@0.5.2: resolution: {integrity: sha512-3GKBOn+m2LX9iq+JC1064cSFprJY4jL1jCXTcpnfER5HYE2l/4EfWSGzkPa/ZDBmYI0ZOEj5VHV/eKnPGkHuOg==} @@ -5200,6 +5268,9 @@ packages: json-stable-stringify-without-jsonify@1.0.1: resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==} + json-stringify-safe@5.0.1: + resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==} + json5@1.0.2: resolution: {integrity: sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==} hasBin: true @@ -5385,6 +5456,9 @@ packages: resolution: {integrity: sha512-i24m8rpwhmPIS4zscNzK6MSEhk0DUWa/8iYQWxhffV8jkI4Phvs3F+quL5xvS0gdQR0FyTCMMH33Y78dDTzzIw==} engines: {node: '>=18'} + long@5.3.2: + resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==} + longest-streak@3.1.0: resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==} @@ -5431,6 +5505,10 @@ packages: engines: {node: '>= 20'} hasBin: true + matcher@3.0.0: + resolution: {integrity: 
sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng==} + engines: {node: '>=10'} + math-intrinsics@1.1.0: resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==} engines: {node: '>= 0.4'} @@ -5798,6 +5876,19 @@ packages: oniguruma-to-es@4.3.4: resolution: {integrity: sha512-3VhUGN3w2eYxnTzHn+ikMI+fp/96KoRSVK9/kMTcFqj1NRDh2IhQCKvYxDnWePKRXY/AqH+Fuiyb7VHSzBjHfA==} + onnxruntime-common@1.24.0-dev.20251116-b39e144322: + resolution: {integrity: sha512-BOoomdHYmNRL5r4iQ4bMvsl2t0/hzVQ3OM3PHD0gxeXu1PmggqBv3puZicEUVOA3AtHHYmqZtjMj9FOfGrATTw==} + + onnxruntime-common@1.24.3: + resolution: {integrity: sha512-GeuPZO6U/LBJXvwdaqHbuUmoXiEdeCjWi/EG7Y1HNnDwJYuk6WUbNXpF6luSUY8yASul3cmUlLGrCCL1ZgVXqA==} + + onnxruntime-node@1.24.3: + resolution: {integrity: sha512-JH7+czbc8ALA819vlTgcV+Q214/+VjGeBHDjX81+ZCD0PCVCIFGFNtT0V4sXG/1JXypKPgScQcB3ij/hk3YnTg==} + os: [win32, darwin, linux] + + onnxruntime-web@1.26.0-dev.20260416-b7804b056c: + resolution: {integrity: sha512-MD6Ss4GSpQBo6zqoJzyT9LRbKYs7x/JVN23FT24EcEvlqF4VuzPOeH6X38orZPKHQDbprn7K+SBpu0/mj2CQiw==} + open@11.0.0: resolution: {integrity: sha512-smsWv2LzFjP03xmvFoJ331ss6h+jixfA4UUV/Bsiyuu4YJPfN+FIQGOIiv4w9/+MoHkfkJ22UIaQWRVFRfH6Vw==} engines: {node: '>=20'} @@ -5911,6 +6002,9 @@ packages: pkg-types@1.3.1: resolution: {integrity: sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ==} + platform@1.3.6: + resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} + playwright-core@1.58.2: resolution: {integrity: sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==} engines: {node: '>=18'} @@ -6036,6 +6130,10 @@ packages: prosemirror-view@1.41.5: resolution: {integrity: sha512-UDQbIPnDrjE8tqUBbPmCOZgtd75htE6W3r0JCmY9bL6W1iemDM37MZEKC49d+tdQ0v/CKx4gjxLoLsfkD2NiZA==} + protobufjs@7.6.1: 
+ resolution: {integrity: sha512-4K0myLaWL5EteuSAro91EGFgcfVgxb64Jx+7oDAY6GOkXD4M69yuSEljNcInGVCA5sOPxmZ/EqDLj2x0Q0+Ygg==} + engines: {node: '>=12.0.0'} + proxy-addr@2.0.7: resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} engines: {node: '>= 0.10'} @@ -6170,6 +6268,7 @@ packages: recharts@2.15.4: resolution: {integrity: sha512-UT/q6fwS3c1dHbXv2uFgYJ9BMFHu3fwnd7AYZaEQhXuYQ4hgsxLvsUXzGdKeZrW5xopzDCvuA2N41WJ88I7zIw==} engines: {node: '>=14'} + deprecated: 1.x and 2.x branches are no longer active. Bump to Recharts v3 to receive latest features and bugfixes. See https://github.com/recharts/recharts/wiki/3.0-migration-guide peerDependencies: react: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 react-dom: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 @@ -6265,6 +6364,10 @@ packages: resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==} engines: {iojs: '>=1.0.0', node: '>=0.10.0'} + roarr@2.15.4: + resolution: {integrity: sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A==} + engines: {node: '>=8.0'} + robust-predicates@3.0.2: resolution: {integrity: sha512-IXgzBWvWQwE6PrDI05OvmXUIruQTcoMDzRsOd5CDvHCVLcLHMTSYvOK5Cm46kWqlV3yAbuSpBZdJ5oP5OUoStg==} @@ -6315,6 +6418,9 @@ packages: resolution: {integrity: sha512-3A6sD0WYP7+QrjbfNA2FN3FsOaGGFoekCVgTyypy53gPxhbkCIjtO6YWgdrfM+n/8sI8JeXZOIxsHjMTNxQ4nQ==} engines: {node: ^14.0.0 || >=16.0.0} + semver-compare@1.0.0: + resolution: {integrity: sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==} + semver@6.3.1: resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==} hasBin: true @@ -6328,6 +6434,10 @@ packages: resolution: {integrity: sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==} engines: {node: '>= 18'} + 
serialize-error@7.0.1: + resolution: {integrity: sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw==} + engines: {node: '>=10'} + serve-static@2.2.1: resolution: {integrity: sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==} engines: {node: '>= 18'} @@ -6416,6 +6526,9 @@ packages: resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==} engines: {node: '>= 10.x'} + sprintf-js@1.1.3: + resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==} + stable-hash@0.0.5: resolution: {integrity: sha512-+L3ccpzibovGXFK+Ap/f8LOS0ahMrHTf3xu7mMLSpEGU0EO9ucaysSylKo9eRDFNhWve/y275iPmIZ4z39a9iA==} @@ -6664,6 +6777,10 @@ packages: resolution: {integrity: sha512-Acylog8/luQ8L7il+geoSxhEkazvkslg7PSNKOX59mbB9cOveP5aq9h74Y7YU8yDpJwetzQQrfIwtf4Wp4LKcw==} engines: {node: '>=4'} + type-fest@0.13.1: + resolution: {integrity: sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==} + engines: {node: '>=10'} + type-fest@5.4.0: resolution: {integrity: sha512-wfkA6r0tBpVfGiyO+zbf9e10QkRQSlK9F2UvyfnjoCmrvH2bjHyhPzhugSBOuq1dog3P0+FKckqe+Xf6WKVjwg==} engines: {node: '>=20'} @@ -7914,6 +8031,18 @@ snapshots: dependencies: hono: 4.11.3 + '@huggingface/jinja@0.5.9': {} + + '@huggingface/tokenizers@0.1.3': {} + + '@huggingface/transformers@4.2.0': + dependencies: + '@huggingface/jinja': 0.5.9 + '@huggingface/tokenizers': 0.1.3 + onnxruntime-node: 1.24.3 + onnxruntime-web: 1.26.0-dev.20260416-b7804b056c + sharp: 0.34.5 + '@humanfs/core@0.19.1': {} '@humanfs/node@0.16.7': @@ -7933,8 +8062,7 @@ snapshots: '@iconify/types': 2.0.0 mlly: 1.8.0 - '@img/colour@1.1.0': - optional: true + '@img/colour@1.1.0': {} '@img/sharp-darwin-arm64@0.34.5': optionalDependencies: @@ -8320,6 +8448,28 @@ snapshots: dependencies: playwright: 1.58.2 + 
'@protobufjs/aspromise@1.1.2': {} + + '@protobufjs/base64@1.1.2': {} + + '@protobufjs/codegen@2.0.5': {} + + '@protobufjs/eventemitter@1.1.1': {} + + '@protobufjs/fetch@1.1.1': + dependencies: + '@protobufjs/aspromise': 1.1.2 + + '@protobufjs/float@1.0.2': {} + + '@protobufjs/inquire@1.1.2': {} + + '@protobufjs/path@1.1.2': {} + + '@protobufjs/pool@1.1.0': {} + + '@protobufjs/utf8@1.1.1': {} + '@radix-ui/number@1.1.1': {} '@radix-ui/primitive@1.1.3': {} @@ -10077,6 +10227,8 @@ snapshots: acorn@8.15.0: {} + adm-zip@0.5.17: {} + agent-base@7.1.4: {} ajv-formats@3.0.1(ajv@8.17.1): @@ -10230,6 +10382,8 @@ snapshots: transitivePeerDependencies: - supports-color + boolean@3.2.0: {} + brace-expansion@1.1.12: dependencies: balanced-match: 1.0.2 @@ -10742,6 +10896,8 @@ snapshots: detect-node-es@1.1.0: {} + detect-node@2.1.0: {} + devlop@1.1.0: dependencies: dequal: 2.0.3 @@ -10943,6 +11099,8 @@ snapshots: is-date-object: 1.1.0 is-symbol: 1.1.1 + es6-error@4.1.1: {} + esbuild@0.21.5: optionalDependencies: '@esbuild/aix-ppc64': 0.21.5 @@ -11039,8 +11197,8 @@ snapshots: '@next/eslint-plugin-next': 16.1.1 eslint: 9.39.2(jiti@2.6.1) eslint-import-resolver-node: 0.3.9 - eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) - eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) + eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1)) + eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)) 
eslint-plugin-jsx-a11y: 6.10.2(eslint@9.39.2(jiti@2.6.1)) eslint-plugin-react: 7.37.5(eslint@9.39.2(jiti@2.6.1)) eslint-plugin-react-hooks: 7.0.1(eslint@9.39.2(jiti@2.6.1)) @@ -11062,7 +11220,7 @@ snapshots: transitivePeerDependencies: - supports-color - eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)): + eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1)): dependencies: '@nolyfill/is-core-module': 1.0.39 debug: 4.4.3 @@ -11073,22 +11231,22 @@ snapshots: tinyglobby: 0.2.15 unrs-resolver: 1.11.1 optionalDependencies: - eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) + eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)) transitivePeerDependencies: - supports-color - eslint-module-utils@2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)): + eslint-module-utils@2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)): dependencies: debug: 3.2.7 optionalDependencies: '@typescript-eslint/parser': 
8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3) eslint: 9.39.2(jiti@2.6.1) eslint-import-resolver-node: 0.3.9 - eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) + eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1)) transitivePeerDependencies: - supports-color - eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)): + eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)): dependencies: '@rtsao/scc': 1.1.0 array-includes: 3.1.9 @@ -11099,7 +11257,7 @@ snapshots: doctrine: 2.1.0 eslint: 9.39.2(jiti@2.6.1) eslint-import-resolver-node: 0.3.9 - eslint-module-utils: 2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)) + eslint-module-utils: 2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)) hasown: 2.0.2 is-core-module: 2.16.1 is-glob: 4.0.3 @@ -11425,6 +11583,8 @@ snapshots: flatted: 3.3.3 keyv: 4.5.4 + flatbuffers@25.9.23: {} + flatted@3.3.3: {} for-each@0.3.5: @@ -11537,6 +11697,15 @@ snapshots: dependencies: is-glob: 4.0.3 + 
global-agent@3.0.0: + dependencies: + boolean: 3.2.0 + es6-error: 4.1.1 + matcher: 3.0.0 + roarr: 2.15.4 + semver: 7.7.4 + serialize-error: 7.0.1 + global-directory@4.0.1: dependencies: ini: 4.1.1 @@ -11556,6 +11725,8 @@ snapshots: graphql@16.12.0: {} + guid-typescript@1.0.9: {} + hachure-fill@0.5.2: {} happy-dom@20.8.9: @@ -11974,6 +12145,8 @@ snapshots: json-stable-stringify-without-jsonify@1.0.1: {} + json-stringify-safe@5.0.1: {} + json5@1.0.2: dependencies: minimist: 1.2.8 @@ -12127,6 +12300,8 @@ snapshots: chalk: 5.6.2 is-unicode-supported: 1.3.0 + long@5.3.2: {} + longest-streak@3.1.0: {} loose-envify@1.4.0: @@ -12172,6 +12347,10 @@ snapshots: marked@16.4.2: {} + matcher@3.0.0: + dependencies: + escape-string-regexp: 4.0.0 + math-intrinsics@1.1.0: {} mdast-util-find-and-replace@3.0.2: @@ -12655,7 +12834,7 @@ snapshots: react: 19.2.3 react-dom: 19.2.3(react@19.2.3) - next@16.1.7(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3): + next@16.1.7(@babel/core@7.28.6)(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3): dependencies: '@next/env': 16.1.7 '@swc/helpers': 0.5.15 @@ -12664,7 +12843,7 @@ snapshots: postcss: 8.4.31 react: 19.2.3 react-dom: 19.2.3(react@19.2.3) - styled-jsx: 5.1.6(react@19.2.3) + styled-jsx: 5.1.6(@babel/core@7.28.6)(react@19.2.3) optionalDependencies: '@next/swc-darwin-arm64': 16.1.7 '@next/swc-darwin-x64': 16.1.7 @@ -12775,6 +12954,25 @@ snapshots: regex: 6.1.0 regex-recursion: 6.0.2 + onnxruntime-common@1.24.0-dev.20251116-b39e144322: {} + + onnxruntime-common@1.24.3: {} + + onnxruntime-node@1.24.3: + dependencies: + adm-zip: 0.5.17 + global-agent: 3.0.0 + onnxruntime-common: 1.24.3 + + onnxruntime-web@1.26.0-dev.20260416-b7804b056c: + dependencies: + flatbuffers: 25.9.23 + guid-typescript: 1.0.9 + long: 5.3.2 + onnxruntime-common: 1.24.0-dev.20251116-b39e144322 + platform: 1.3.6 + protobufjs: 7.6.1 + open@11.0.0: dependencies: default-browser: 5.4.0 @@ -12894,6 +13092,8 @@ snapshots: mlly: 1.8.0 
pathe: 2.0.3 + platform@1.3.6: {} + playwright-core@1.58.2: {} playwright@1.58.2: @@ -13066,6 +13266,21 @@ snapshots: prosemirror-state: 1.4.4 prosemirror-transform: 1.11.0 + protobufjs@7.6.1: + dependencies: + '@protobufjs/aspromise': 1.1.2 + '@protobufjs/base64': 1.1.2 + '@protobufjs/codegen': 2.0.5 + '@protobufjs/eventemitter': 1.1.1 + '@protobufjs/fetch': 1.1.1 + '@protobufjs/float': 1.0.2 + '@protobufjs/inquire': 1.1.2 + '@protobufjs/path': 1.1.2 + '@protobufjs/pool': 1.1.0 + '@protobufjs/utf8': 1.1.1 + '@types/node': 20.19.28 + long: 5.3.2 + proxy-addr@2.0.7: dependencies: forwarded: 0.2.0 @@ -13399,6 +13614,15 @@ snapshots: reusify@1.1.0: {} + roarr@2.15.4: + dependencies: + boolean: 3.2.0 + detect-node: 2.1.0 + globalthis: 1.0.4 + json-stringify-safe: 5.0.1 + semver-compare: 1.0.0 + sprintf-js: 1.1.3 + robust-predicates@3.0.2: {} rollup@4.55.1: @@ -13488,6 +13712,8 @@ snapshots: refa: 0.12.1 regexp-ast-analysis: 0.7.1 + semver-compare@1.0.0: {} + semver@6.3.1: {} semver@7.7.4: {} @@ -13508,6 +13734,10 @@ snapshots: transitivePeerDependencies: - supports-color + serialize-error@7.0.1: + dependencies: + type-fest: 0.13.1 + serve-static@2.2.1: dependencies: encodeurl: 2.0.0 @@ -13615,7 +13845,6 @@ snapshots: '@img/sharp-win32-arm64': 0.34.5 '@img/sharp-win32-ia32': 0.34.5 '@img/sharp-win32-x64': 0.34.5 - optional: true shebang-command@2.0.0: dependencies: @@ -13683,6 +13912,8 @@ snapshots: split2@4.2.0: {} + sprintf-js@1.1.3: {} + stable-hash@0.0.5: {} stackback@0.0.2: {} @@ -13807,10 +14038,12 @@ snapshots: dependencies: inline-style-parser: 0.2.7 - styled-jsx@5.1.6(react@19.2.3): + styled-jsx@5.1.6(@babel/core@7.28.6)(react@19.2.3): dependencies: client-only: 0.0.1 react: 19.2.3 + optionalDependencies: + '@babel/core': 7.28.6 stylis@4.3.6: {} @@ -13926,6 +14159,8 @@ snapshots: type-detect@4.1.0: {} + type-fest@0.13.1: {} + type-fest@5.4.0: dependencies: tagged-tag: 1.0.0 diff --git a/apps/web/app/settings/voice-mode/page.tsx 
b/apps/web/app/settings/voice-mode/page.tsx new file mode 100644 index 000000000..2bcc1b851 --- /dev/null +++ b/apps/web/app/settings/voice-mode/page.tsx @@ -0,0 +1,21 @@ +import { VoiceModeSettings } from "@/components/settings/voice-mode-settings"; +import { StateProvider } from "@/components/state-provider"; +import { fetchUserSettings } from "@/lib/api"; +import { mapUserSettingsResponse } from "@/lib/ssr/user-settings"; + +export default async function VoiceModeSettingsPage() { + let initialState = {}; + try { + const response = await fetchUserSettings({ cache: "no-store" }); + const mapped = mapUserSettingsResponse(response); + initialState = { userSettings: mapped.loaded ? mapped : undefined }; + } catch { + initialState = {}; + } + + return ( + + + + ); +} diff --git a/apps/web/components/settings/editors-settings-state.tsx b/apps/web/components/settings/editors-settings-state.tsx index 2d2e76e3b..ef891eeb5 100644 --- a/apps/web/components/settings/editors-settings-state.tsx +++ b/apps/web/components/settings/editors-settings-state.tsx @@ -7,7 +7,7 @@ import { createEditor, deleteEditor, updateEditor, updateUserSettings } from "@/ import { useRequest } from "@/lib/http/use-request"; import type { EditorOption } from "@/lib/types/http"; import { type ComboboxOption } from "@/components/combobox"; -import { parseTerminalLinkBehavior } from "@/lib/ssr/user-settings"; +import { parseTerminalLinkBehavior, parseVoiceMode } from "@/lib/ssr/user-settings"; import { fromApiSidebarView } from "@/lib/state/slices/ui/sidebar-view-wire"; import { type EditorFormState, @@ -245,6 +245,7 @@ function buildUserSettingsFromResponse( terminalFontFamily: s.terminal_font_family || null, terminalFontSize: s.terminal_font_size || null, changesPanelLayout: s.changes_panel_layout === "tree" ? 
("tree" as const) : ("flat" as const), + voiceMode: parseVoiceMode(s.voice_mode), ...mapEditorSettingsFields(s), }; } diff --git a/apps/web/components/settings/keyboard-shortcuts-card.tsx b/apps/web/components/settings/keyboard-shortcuts-card.tsx index 7b1c363be..adb484e27 100644 --- a/apps/web/components/settings/keyboard-shortcuts-card.tsx +++ b/apps/web/components/settings/keyboard-shortcuts-card.tsx @@ -17,7 +17,7 @@ import { useAppStore } from "@/components/state-provider"; import { useToast } from "@/components/toast-provider"; import { updateUserSettings } from "@/lib/api/domains/settings-api"; -function ShortcutRecorder({ +export function ShortcutRecorder({ shortcutId, current, onChange, diff --git a/apps/web/components/settings/settings-app-sidebar.tsx b/apps/web/components/settings/settings-app-sidebar.tsx index 20993c191..1ea984397 100644 --- a/apps/web/components/settings/settings-app-sidebar.tsx +++ b/apps/web/components/settings/settings-app-sidebar.tsx @@ -11,6 +11,7 @@ import { IconCode, IconCpu, IconKey, + IconMicrophone, IconMessageCircle, IconBrandGithub, IconBrandGitlab, @@ -315,6 +316,47 @@ function ExecutorsSidebarSection({ pathname, executors }: ExecutorsSidebarSectio ); } +type SimpleSidebarEntry = { + href: string; + label: string; + Icon: typeof IconBrandGithub; +}; + +/** + * A short row of single-link sidebar entries (Automations, Prompts, Voice + * Mode, Utility Agents, External MCP) — extracted from `SettingsAppSidebar` + * so the parent function stays under the 100-line lint limit. 
+ */ +function SimpleSidebarRows({ + pathname, + entries, +}: { + pathname: string; + entries: SimpleSidebarEntry[]; +}) { + return ( + <> + {entries.map(({ href, label, Icon }) => ( + + + + + {label} + + + + ))} + + ); +} + function SecretsSidebarSection({ pathname }: { pathname: string }) { return ( @@ -369,57 +411,33 @@ export function SettingsAppSidebar() { - {/* Automations */} - - - - - Automations - - - - + - - {/* Prompts */} - - - - - Prompts - - - - - {/* Utility Agents */} - - - - - Utility Agents - - - - + - - - {/* External MCP */} - - - - - External MCP - - - + {/* System */} diff --git a/apps/web/components/settings/voice-mode-settings.tsx b/apps/web/components/settings/voice-mode-settings.tsx new file mode 100644 index 000000000..1a0d702cd --- /dev/null +++ b/apps/web/components/settings/voice-mode-settings.tsx @@ -0,0 +1,518 @@ +"use client"; + +import { useCallback, useMemo, useState } from "react"; +import { IconAlertTriangle, IconMicrophone } from "@tabler/icons-react"; +import { Badge } from "@kandev/ui/badge"; +import { Card, CardContent, CardHeader, CardTitle } from "@kandev/ui/card"; +import { Label } from "@kandev/ui/label"; +import { RadioGroup, RadioGroupItem } from "@kandev/ui/radio-group"; +import { + Select, + SelectContent, + SelectGroup, + SelectItem, + SelectLabel, + SelectTrigger, + SelectValue, +} from "@kandev/ui/select"; +import { Switch } from "@kandev/ui/switch"; +import { useAppStore, useAppStoreApi } from "@/components/state-provider"; +import { useToast } from "@/components/toast-provider"; +import { updateUserSettings } from "@/lib/api"; +import { SettingsSection } from "@/components/settings/settings-section"; +import { ShortcutRecorder } from "@/components/settings/keyboard-shortcuts-card"; +import { detectVoiceCapabilities, type VoiceCapabilities } from "@/lib/voice/capabilities"; +import type { VoiceModeState } from "@/lib/state/slices/settings/types"; +import type { KeyboardShortcut } from "@/lib/keyboard/constants"; 
+import { + CONFIGURABLE_SHORTCUTS, + getShortcut, + type StoredShortcutOverrides, +} from "@/lib/keyboard/shortcut-overrides"; +import type { + VoiceInputActivationMode, + VoiceInputEngine, + VoiceModeSettings as VoiceModeWire, + WhisperWebModelSize, +} from "@/lib/types/http-voice"; + +// Single source of truth for the language options. Web Speech reads `lang`, +// Whisper engines treat it as a hint. "auto" defers to the browser locale. +const LANGUAGE_OPTIONS: Array<{ value: string; label: string }> = [ + { value: "auto", label: "Auto-detect (browser language)" }, + { value: "en-US", label: "English (United States)" }, + { value: "en-GB", label: "English (United Kingdom)" }, + { value: "es-ES", label: "Spanish (Spain)" }, + { value: "es-MX", label: "Spanish (Mexico)" }, + { value: "pt-PT", label: "Portuguese (Portugal)" }, + { value: "pt-BR", label: "Portuguese (Brazil)" }, + { value: "fr-FR", label: "French" }, + { value: "de-DE", label: "German" }, + { value: "it-IT", label: "Italian" }, + { value: "ja-JP", label: "Japanese" }, + { value: "zh-CN", label: "Chinese (Simplified)" }, +]; + +const WHISPER_MODELS: Array<{ + value: WhisperWebModelSize; + label: string; + size: string; + hint: string; +}> = [ + { value: "tiny", label: "Tiny", size: "~40 MB", hint: "Fastest, lower accuracy" }, + { value: "base", label: "Base", size: "~75 MB", hint: "Balanced default" }, + { value: "small", label: "Small", size: "~240 MB", hint: "Best accuracy, slower load" }, +]; + +function toWire(state: VoiceModeState): VoiceModeWire { + return { + enabled: state.enabled, + engine: state.engine, + language: state.language, + mode: state.mode, + auto_send: state.autoSend, + whisper_web_model: state.whisperWebModel, + }; +} + +// ── Save hook ──────────────────────────────────────────────────────────── + +function useVoiceModeSaver() { + // Read userSettings via the store API (not as a React selector) so the + // async save handler reads the latest snapshot at invocation time instead + 
// of capturing a stale closure. Without this, concurrent settings updates + // racing with this save (or a rejection rolling back to a stale snapshot) + // can silently overwrite unrelated fields. + const storeApi = useAppStoreApi(); + const setUserSettings = useAppStore((s) => s.setUserSettings); + const { toast } = useToast(); + const [saving, setSaving] = useState(false); + + const save = useCallback( + async (patch: Partial) => { + const current = storeApi.getState().userSettings; + const previous = current.voiceMode; + const next = { ...previous, ...patch }; + setUserSettings({ ...current, voiceMode: next }); + setSaving(true); + try { + await updateUserSettings({ voice_mode: toWire(next) }); + } catch { + // Rollback only the keys this request changed AND only when the live + // value still matches what we optimistically wrote. If a newer save + // for the same key landed first, that's now the truth — reverting + // would silently roll back the user's later edit. + const latest = storeApi.getState().userSettings; + const reverted: Partial = {}; + for (const key of Object.keys(patch) as Array) { + if (latest.voiceMode[key] !== next[key]) continue; + // Cast through unknown so the per-key assignment passes strict checks. 
+ (reverted as Record)[key] = previous[key]; + } + setUserSettings({ + ...latest, + voiceMode: { ...latest.voiceMode, ...reverted }, + }); + toast({ title: "Failed to save Voice Mode setting", variant: "error" }); + } finally { + setSaving(false); + } + }, + [storeApi, setUserSettings, toast], + ); + + return { save, saving }; +} + +// ── Engine card ────────────────────────────────────────────────────────── + +type EngineOption = { + value: VoiceInputEngine; + label: string; + description: string; + badge?: string; + disabled?: boolean; +}; + +function buildEngineOptions(caps: VoiceCapabilities): EngineOption[] { + return [ + { + value: "auto", + label: "Automatic", + description: "Use the best engine available in this browser.", + }, + { + value: "webSpeech", + label: "Web Speech (in-browser)", + description: caps.webSpeech + ? "Free, instant, uses your browser's built-in speech recognition." + : "Not supported in this browser.", + disabled: !caps.webSpeech, + }, + { + value: "whisperWeb", + label: "Whisper Web (private, in-browser)", + description: caps.whisperWeb + ? "Runs OpenAI Whisper entirely on this device. First use downloads the model (40–240 MB)." + : "Not supported in this browser.", + badge: "Local", + disabled: !caps.whisperWeb, + }, + { + value: "whisperServer", + label: "Whisper Server (OpenAI)", + description: caps.audioCapture + ? "Sends audio to the backend, which forwards it to OpenAI's Whisper API. Requires a configured API key on the server." 
+ : "Not supported in this browser.", + badge: "Server", + disabled: !caps.audioCapture, + }, + ]; +} + +function EngineCard({ caps }: { caps: VoiceCapabilities }) { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + const options = useMemo(() => buildEngineOptions(caps), [caps]); + + return ( + + + Transcription Engine + + + save({ engine: v as VoiceInputEngine })} + disabled={saving} + className="space-y-3" + > + {options.map((opt) => ( + + ))} + + + + ); +} + +// ── Behavior card (language + mode + auto-send) ────────────────────────── + +function LanguageRow() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + return ( +
+ + +

+ Recognition quality drops sharply when the language doesn't match what you're + speaking. +

+
+ ); +} + +function ModeRow() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + return ( +
+ + save({ mode: v as VoiceInputActivationMode })} + disabled={saving} + className="flex gap-4" + > + + + +
+ ); +} + +function AutoSendRow() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + return ( +
+
+ +

+ Submit the message as soon as the transcript is inserted. +

+
+ save({ autoSend: checked })} + disabled={saving} + /> +
+ ); +} + +function BehaviorCard() { + return ( + + + Behavior + + + + + + + + ); +} + +// ── Whisper Web model card ─────────────────────────────────────────────── + +function WhisperModelCard() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + + return ( + + + Whisper Web Model + + + save({ whisperWebModel: v as WhisperWebModelSize })} + disabled={saving} + className="space-y-2" + > + {WHISPER_MODELS.map((m) => ( + + ))} + +

+ The model downloads on first use and is cached in your browser. Switching models triggers + another download next time you record. +

+
+
+ ); +} + +// ── Enable card (top-level on/off) ─────────────────────────────────────── + +function EnableCard() { + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const { save, saving } = useVoiceModeSaver(); + return ( + + + Enable Voice Input + + +
+
+ +

+ When off, the voice button is hidden entirely and no voice-related code runs. Settings + below are preserved and re-applied when you turn it back on. +

+
+ save({ enabled: checked })} + disabled={saving} + /> +
+
+
+ ); +} + +// ── Availability banner ────────────────────────────────────────────────── + +function AvailabilityBanner({ caps }: { caps: VoiceCapabilities }) { + if (caps.webSpeech || caps.whisperWeb || caps.audioCapture) return null; + // Secure-context requirement is the most common reason capability detection + // returns all-false on mobile (when reaching the dev server over LAN HTTP). + // Spell it out so the user doesn't have to guess. + const insecure = typeof window !== "undefined" && !window.isSecureContext; + return ( +
+ +
+

Voice input is unavailable in this browser.

+

+ {insecure + ? "Microphone APIs require HTTPS or localhost. You appear to be on an insecure HTTP origin — load this page over HTTPS (or http://localhost) to enable voice input." + : "Your browser doesn't expose either the Web Speech API or MediaRecorder. Try Chrome, Edge, or Safari 14.5+."} +

+
+
+ ); +} + +// ── Voice keyboard shortcut card ───────────────────────────────────────── + +function useShortcutSaver() { + // Same stale-closure protection as useVoiceModeSaver — read live store + // state at call time so a concurrent keyboard-shortcut change from another + // settings card isn't clobbered by this card's optimistic update / rollback. + const storeApi = useAppStoreApi(); + const setUserSettings = useAppStore((s) => s.setUserSettings); + const { toast } = useToast(); + return useCallback( + (next: StoredShortcutOverrides) => { + const current = storeApi.getState().userSettings; + const previous = current.keyboardShortcuts; + setUserSettings({ ...current, keyboardShortcuts: next }); + updateUserSettings({ keyboard_shortcuts: next }).catch(() => { + // Rollback only the keys this request changed AND only when the live + // value still matches what we optimistically wrote. Skip otherwise so + // a newer successful save to the same key isn't reverted. + const latest = storeApi.getState().userSettings; + const restored: StoredShortcutOverrides = { ...latest.keyboardShortcuts }; + const changedKeys = new Set([...Object.keys(previous), ...Object.keys(next)]); + for (const key of changedKeys) { + if (previous[key] === next[key]) continue; + if (latest.keyboardShortcuts[key] !== next[key]) continue; + if (previous[key] === undefined) delete restored[key]; + else restored[key] = previous[key]; + } + setUserSettings({ ...latest, keyboardShortcuts: restored }); + toast({ title: "Failed to save shortcut", variant: "error" }); + }); + }, + [storeApi, setUserSettings, toast], + ); +} + +function VoiceShortcutCard() { + const overrides = useAppStore((s) => s.userSettings.keyboardShortcuts); + const persist = useShortcutSaver(); + const current = getShortcut("VOICE_INPUT_TOGGLE", overrides); + + const handleChange = useCallback( + (_id: string, shortcut: KeyboardShortcut) => + persist({ ...overrides, VOICE_INPUT_TOGGLE: shortcut }), + [overrides, persist], + ); + 
const handleReset = useCallback(() => { + const next = { ...overrides }; + delete next.VOICE_INPUT_TOGGLE; + persist(next); + }, [overrides, persist]); + + return ( + + + + {CONFIGURABLE_SHORTCUTS.VOICE_INPUT_TOGGLE.label} Shortcut + + + + +

+ Click the shortcut to record a new key combination. All keyboard shortcuts can also be + edited in General Settings. +

+
+
+ ); +} + +// ── Page ───────────────────────────────────────────────────────────────── + +export function VoiceModeSettings() { + const caps = useMemo(() => detectVoiceCapabilities(), []); + const enabled = useAppStore((s) => s.userSettings.voiceMode.enabled); + return ( + } + title="Voice Mode" + description="Configure how voice input works on the chat composer." + > +
+ + {/* When voice is disabled, keep showing the secondary cards but dim + them — preserves the visible configuration without implying it has + any effect right now. */} +
+
+ + + + + +
+
+
+
+ ); +} diff --git a/apps/web/components/task/chat/chat-input-body.tsx b/apps/web/components/task/chat/chat-input-body.tsx index aad579f9d..1141c038a 100644 --- a/apps/web/components/task/chat/chat-input-body.tsx +++ b/apps/web/components/task/chat/chat-input-body.tsx @@ -52,6 +52,10 @@ export type ChatInputEditorAreaProps = { onEnhancePrompt?: () => void; isEnhancingPrompt?: boolean; isUtilityConfigured?: boolean; + /** Inserts a voice transcript into the editor at the current cursor. */ + onVoiceTranscript?: (text: string) => void; + /** Submit the message after a voice transcript is inserted (when auto-send is on). */ + onVoiceAutoSend?: () => void; }; function EditorWithTooltip({ @@ -123,6 +127,7 @@ export function ChatInputEditorArea(p: ChatInputEditorAreaProps) { const { isSending, onCancel, contextCount, contextPopoverOpen, setContextPopoverOpen } = p; const { contextFiles, onImplementPlan, onEnhancePrompt, isEnhancingPrompt } = p; const { isUtilityConfigured, hideSessionsDropdown, minimalToolbar, hidePlanMode } = p; + const { onVoiceTranscript, onVoiceAutoSend } = p; // Exclude auto-added plan context from the count — it's always present in plan mode // and shouldn't by itself enable the send button. const userContextCount = planContextEnabled ? 
Math.max(0, contextCount - 1) : contextCount; @@ -186,6 +191,8 @@ export function ChatInputEditorArea(p: ChatInputEditorAreaProps) { isEnhancingPrompt={isEnhancingPrompt} isUtilityConfigured={isUtilityConfigured} onAttachFiles={handleAttachFiles} + onVoiceTranscript={onVoiceTranscript} + onVoiceAutoSend={onVoiceAutoSend} hideSessionsDropdown={hideSessionsDropdown} minimalToolbar={minimalToolbar} hidePlanMode={hidePlanMode} diff --git a/apps/web/components/task/chat/chat-input-container.tsx b/apps/web/components/task/chat/chat-input-container.tsx index 5975e4550..22ffd9cd9 100644 --- a/apps/web/components/task/chat/chat-input-container.tsx +++ b/apps/web/components/task/chat/chat-input-container.tsx @@ -250,6 +250,8 @@ type EnhancePromptExtras = { onEnhancePrompt?: () => void; isEnhancingPrompt?: boolean; isUtilityConfigured?: boolean; + onVoiceTranscript?: (text: string) => void; + onVoiceAutoSend?: () => void; }; function buildEditorAreaProps( @@ -295,6 +297,8 @@ function buildEditorAreaProps( onEnhancePrompt: extras.onEnhancePrompt, isEnhancingPrompt: extras.isEnhancingPrompt, isUtilityConfigured: extras.isUtilityConfigured, + onVoiceTranscript: extras.onVoiceTranscript, + onVoiceAutoSend: extras.onVoiceAutoSend, hideSessionsDropdown: p.hideSessionsDropdown, minimalToolbar: p.minimalToolbar, hidePlanMode: p.hidePlanMode, @@ -359,6 +363,34 @@ export const ChatInputContainer = forwardRef { + const editor = s.inputRef.current; + if (!editor) return; + const trimmed = text.trim(); + if (!trimmed) return; + const cursor = editor.getSelectionStart(); + const current = editor.getValue(); + // Prepend a space when inserting after existing non-whitespace content + // so transcripts flow naturally without running into the previous word. + const charBefore = cursor > 0 ? current.charAt(cursor - 1) : ""; + const needsLeadingSpace = charBefore !== "" && !/\s/.test(charBefore); + const insert = needsLeadingSpace ? 
` ${trimmed}` : trimmed; + editor.insertText(insert, cursor, cursor); + }, + [s.inputRef], + ); + + // Auto-send fires the same submit path as the regular send button. Guards + // against firing while the input is in a disabled state (e.g. the agent + // is currently booting) — the button is hidden in that case anyway, but + // defence-in-depth so a stale keyboard shortcut press doesn't trigger. + const { submitDisabled: voiceSubmitDisabled, handleSubmitWithReset: voiceSubmit } = s; + const handleVoiceAutoSend = useCallback(() => { + if (voiceSubmitDisabled) return; + voiceSubmit(); + }, [voiceSubmitDisabled, voiceSubmit]); + if (p.isFailed || executorUnavailable) { return ( ); diff --git a/apps/web/components/task/chat/chat-input-toolbar.tsx b/apps/web/components/task/chat/chat-input-toolbar.tsx index 2447211d5..d8cb9ca10 100644 --- a/apps/web/components/task/chat/chat-input-toolbar.tsx +++ b/apps/web/components/task/chat/chat-input-toolbar.tsx @@ -31,6 +31,7 @@ import { ModeSelector } from "@/components/task/mode-selector"; import { ContextPopover } from "./context-popover"; import { ResetContextButton } from "./reset-context-button"; import { ImplementPlanButton } from "./implement-plan-button"; +import { VoiceInputButton } from "./voice-input-button"; import type { ContextFile } from "@/lib/state/context-files-store"; export type ChatInputToolbarProps = { @@ -67,6 +68,12 @@ export type ChatInputToolbarProps = { isUtilityConfigured?: boolean; /** Callback to open file picker for attaching files */ onAttachFiles?: () => void; + /** Callback to insert a transcribed voice utterance into the editor. When + * omitted, the voice button is hidden — keeps quick-chat / read-only + * variants free of a button they can't wire. */ + onVoiceTranscript?: (text: string) => void; + /** Optional auto-send hook fired after a voice transcript is inserted. 
*/ + onVoiceAutoSend?: () => void; /** Hide the sessions dropdown (for quick chat) */ hideSessionsDropdown?: boolean; /** When true, only render the submit/cancel button — no other controls */ @@ -308,6 +315,8 @@ function ToolbarRightSection({ onCancel, onSubmit, submitShortcut, + onVoiceTranscript, + onVoiceAutoSend, }: { showCollapsed: boolean; rightItems: ToolbarItemConfig[]; @@ -322,6 +331,8 @@ function ToolbarRightSection({ onCancel: () => void; onSubmit: () => void; submitShortcut: (typeof SHORTCUTS)[keyof typeof SHORTCUTS]; + onVoiceTranscript?: (text: string) => void; + onVoiceAutoSend?: () => void; }) { return (
@@ -330,7 +341,14 @@ function ToolbarRightSection({ {planModeEnabled && !isAgentBusy && onImplementPlan && ( )} -
+
+ {onVoiceTranscript && ( + + )}
); diff --git a/apps/web/components/task/chat/voice-input-button.tsx b/apps/web/components/task/chat/voice-input-button.tsx new file mode 100644 index 000000000..978bfebda --- /dev/null +++ b/apps/web/components/task/chat/voice-input-button.tsx @@ -0,0 +1,265 @@ +"use client"; + +import { useCallback, useEffect, useRef } from "react"; +import { IconLoader2, IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react"; + +import { Button } from "@kandev/ui/button"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@kandev/ui/tooltip"; +import { cn } from "@/lib/utils"; +import { + useVoiceInput, + type VoiceError, + type VoiceInputState, + type VoiceModelLoadState, +} from "@/hooks/use-voice-input"; +import { useAppStore } from "@/components/state-provider"; +import { useKeyboardShortcut } from "@/hooks/use-keyboard-shortcut"; +import { useToast } from "@/components/toast-provider"; +import { getShortcut } from "@/lib/keyboard/shortcut-overrides"; + +type VoiceInputButtonProps = { + /** Inserts the recognized transcript at the current cursor position. */ + onTranscript: (text: string) => void; + /** Called after a non-empty transcript was inserted, when auto-send is enabled. */ + onAutoSend?: () => void; + /** Disable while the chat input itself is disabled (sending / starting / failed). 
*/ + disabled?: boolean; +}; + +const TOOLTIP_BY_STATE: Record = { + idle: "Voice input", + requesting: "Requesting microphone…", + recording: "Stop recording", + processing: "Transcribing…", +}; + +const ARIA_BY_STATE: Record = { + idle: "Start voice input", + requesting: "Requesting microphone permission", + recording: "Stop voice input", + processing: "Transcribing voice input", +}; + +function ButtonIcon({ + state, + modelLoad, +}: { + state: VoiceInputState; + modelLoad: VoiceModelLoadState; +}) { + if (state === "processing" || state === "requesting" || modelLoad.state === "loading") { + return ; + } + if (state === "recording") { + return ; + } + return ; +} + +function toastForError(toast: ReturnType["toast"], err: VoiceError) { + if (err.code === "no-speech") { + toast({ title: err.message }); + return; + } + toast({ title: err.message, variant: "error" }); +} + +// ── Activation handlers ────────────────────────────────────────────────── + +function buildHoldHandlers(start: () => Promise, stop: () => Promise) { + return { + onPointerDown: (e: React.PointerEvent) => { + e.preventDefault(); + void start(); + }, + onPointerUp: (e: React.PointerEvent) => { + e.preventDefault(); + void stop(); + }, + onPointerLeave: () => void stop(), + onPointerCancel: () => void stop(), + }; +} + +function buildToggleHandler( + state: VoiceInputState, + start: () => Promise, + stop: () => Promise, +) { + return () => { + if (state === "idle") void start(); + else if (state === "recording") void stop(); + }; +} + +// ── Hook composition ───────────────────────────────────────────────────── + +function useAutoSendOnTranscript( + baseOnTranscript: (text: string) => void, + onAutoSend: (() => void) | undefined, + enabled: boolean, +) { + // Wrap onTranscript so we can defer auto-send until after the transcript + // has been inserted. 
requestAnimationFrame keeps a clean separation between + // the editor update and the submit handler, so the editor's onChange has + // already flushed when submit reads from it. + return useCallback( + (text: string) => { + baseOnTranscript(text); + if (enabled && onAutoSend) requestAnimationFrame(onAutoSend); + }, + [baseOnTranscript, onAutoSend, enabled], + ); +} + +function useVoiceShortcut( + enabled: boolean, + state: VoiceInputState, + start: () => Promise, + stop: () => Promise, +) { + const overrides = useAppStore((s) => s.userSettings.keyboardShortcuts); + const shortcut = getShortcut("VOICE_INPUT_TOGGLE", overrides); + const stateRef = useRef(state); + useEffect(() => { + stateRef.current = state; + }, [state]); + const handler = useCallback(() => { + if (stateRef.current === "idle") void start(); + else if (stateRef.current === "recording") void stop(); + }, [start, stop]); + useKeyboardShortcut(shortcut, handler, { enabled }); +} + +// ── Unsupported fallback ──────────────────────────────────────────────── + +function buildUnsupportedReason(): string { + if (typeof window === "undefined") return "Voice input is unavailable here."; + if (!window.isSecureContext) { + return "Voice input needs HTTPS. Open this site over https:// (or http://localhost) — most mobile browsers block microphone APIs on insecure origins."; + } + return "Voice input isn't supported in this browser. 
Try Chrome, Edge, or Safari 14.5+."; +} + +function UnsupportedVoiceButton({ disabled }: { disabled?: boolean }) { + const { toast } = useToast(); + const handleClick = () => { + toast({ + title: "Voice input unavailable", + description: buildUnsupportedReason(), + variant: "error", + }); + }; + return ( + + + + + Voice input unavailable — tap for details + + ); +} + +// ── Component ──────────────────────────────────────────────────────────── + +export function VoiceInputButton({ onTranscript, onAutoSend, disabled }: VoiceInputButtonProps) { + const enabled = useAppStore((s) => s.userSettings.voiceMode.enabled); + // Render nothing — including no hook subscriptions — when the user has + // disabled the feature in settings. Distinct from `!supported` (browser + // limitation) which shows a tappable greyed icon. Done as a sub-component + // so the unconditional hook count stays the same in the active path. + if (!enabled) return null; + return ( + + ); +} + +function EnabledVoiceInputButton({ onTranscript, onAutoSend, disabled }: VoiceInputButtonProps) { + const { toast } = useToast(); + const voiceMode = useAppStore((s) => s.userSettings.voiceMode); + const handleError = useCallback((err: VoiceError) => toastForError(toast, err), [toast]); + const wrappedTranscript = useAutoSendOnTranscript(onTranscript, onAutoSend, voiceMode.autoSend); + + const { supported, state, modelLoad, start, stop, cancel } = useVoiceInput({ + onTranscript: wrappedTranscript, + onError: handleError, + }); + + // If the chat input gets disabled mid-recording, cancel rather than leave + // the mic indicator on. Hold-mode pointerup may not fire if focus moves. 
+ useEffect(() => { + if (disabled && (state === "recording" || state === "requesting")) cancel(); + }, [disabled, state, cancel]); + + useVoiceShortcut(supported && !disabled, state, start, stop); + + // Always render the button — even when unsupported — so users can see it on + // mobile and tap to learn why voice input isn't working (usually a missing + // secure context, e.g. when reaching the dev server over LAN HTTP). Hiding + // the button silently left mobile users with no discoverable feedback. + if (!supported) return ; + + const isRecording = state === "recording"; + const isBusy = state === "requesting" || state === "processing" || modelLoad.state === "loading"; + const holdMode = voiceMode.mode === "hold"; + + const pointerHandlers = holdMode ? buildHoldHandlers(start, stop) : {}; + const onClick = holdMode ? undefined : buildToggleHandler(state, start, stop); + + // Styled to mirror SubmitButton (h-7 w-7 rounded-full primary fill) so the + // two prominent input actions read as a pair on the right of the toolbar. + // Recording flips to a destructive fill with a pulsing ring so the active + // state is unmistakable even on mobile. + return ( + + + + + + {modelLoad.state === "loading" + ? `Loading model… ${Math.round(modelLoad.progress * 100)}%` + : `${TOOLTIP_BY_STATE[state]}${holdMode && state === "idle" ? " (hold)" : ""}`} + + + ); +} diff --git a/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts b/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts index 3722d205b..fdbcbfb42 100644 --- a/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts +++ b/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts @@ -95,8 +95,10 @@ test.describe("Toolbar overflow menu", () => { // Context badge should be hidden when collapsed to avoid clipping await expect(contextBadge).not.toBeVisible(); - // Submit button should remain visible (always-visible item) - const submitBtn = toolbar.locator("button.rounded-full"); + // Submit button should remain visible (always-visible item). 
Target the + // submit testid specifically — the voice input button is also round, so a + // bare `button.rounded-full` locator now matches both and fails strict mode. + const submitBtn = toolbar.getByTestId("submit-message-button"); await expect(submitBtn).toBeVisible(); // Click expand toggle — items appear inline (scrollable) diff --git a/apps/web/hooks/use-user-display-settings.ts b/apps/web/hooks/use-user-display-settings.ts index 250e2bac2..c06dfb5c2 100644 --- a/apps/web/hooks/use-user-display-settings.ts +++ b/apps/web/hooks/use-user-display-settings.ts @@ -6,7 +6,10 @@ import { useAppStore } from "@/components/state-provider"; import { useRepositories } from "@/hooks/domains/workspace/use-repositories"; import { mapUserSettingsResponse } from "@/lib/ssr/user-settings"; import { repositoryId, type Repository } from "@/lib/types/http"; -import type { UserSettingsState } from "@/lib/state/slices/settings/types"; +import { + DEFAULT_VOICE_MODE_STATE, + type UserSettingsState, +} from "@/lib/state/slices/settings/types"; type DisplaySettings = UserSettingsState; @@ -36,7 +39,15 @@ function carryForwardTerminalSettings(current: DisplaySettings) { }; } -function carryForwardSettings(current: DisplaySettings) { +function carryForwardLspSettings(current: DisplaySettings) { + return { + lspAutoStartLanguages: current.lspAutoStartLanguages ?? [], + lspAutoInstallLanguages: current.lspAutoInstallLanguages ?? [], + lspServerConfigs: current.lspServerConfigs ?? {}, + }; +} + +function carryForwardCoreSettings(current: DisplaySettings) { return { shellOptions: current.shellOptions ?? [], defaultEditorId: current.defaultEditorId ?? null, @@ -44,14 +55,19 @@ function carryForwardSettings(current: DisplaySettings) { reviewAutoMarkOnScroll: current.reviewAutoMarkOnScroll ?? true, showReleaseNotification: current.showReleaseNotification ?? true, releaseNotesLastSeenVersion: current.releaseNotesLastSeenVersion ?? null, - lspAutoStartLanguages: current.lspAutoStartLanguages ?? 
[], - lspAutoInstallLanguages: current.lspAutoInstallLanguages ?? [], - lspServerConfigs: current.lspServerConfigs ?? {}, savedLayouts: current.savedLayouts ?? [], sidebarViews: current.sidebarViews ?? [], defaultUtilityAgentId: current.defaultUtilityAgentId ?? null, keyboardShortcuts: current.keyboardShortcuts ?? {}, changesPanelLayout: current.changesPanelLayout ?? "flat", + voiceMode: current.voiceMode ?? { ...DEFAULT_VOICE_MODE_STATE }, + }; +} + +function carryForwardSettings(current: DisplaySettings) { + return { + ...carryForwardCoreSettings(current), + ...carryForwardLspSettings(current), ...carryForwardTerminalSettings(current), }; } diff --git a/apps/web/hooks/use-voice-input.test.ts b/apps/web/hooks/use-voice-input.test.ts new file mode 100644 index 000000000..3137cceba --- /dev/null +++ b/apps/web/hooks/use-voice-input.test.ts @@ -0,0 +1,199 @@ +import { act, renderHook, waitFor } from "@testing-library/react"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +// ── Hoisted mocks (defined before the modules they replace are evaluated) ── + +const voicePrefs = vi.hoisted(() => ({ + value: { + engine: "auto" as "auto" | "webSpeech" | "whisperWeb" | "whisperServer", + language: "auto", + mode: "toggle" as "toggle" | "hold", + autoSend: false, + whisperWebModel: "base" as "tiny" | "base" | "small", + }, +})); + +vi.mock("@/components/state-provider", () => ({ + useAppStore: ( + selector: (state: { userSettings: { voiceMode: typeof voicePrefs.value } }) => unknown, + ) => selector({ userSettings: { voiceMode: voicePrefs.value } }), +})); + +const transcribeAudio = vi.hoisted(() => vi.fn()); +vi.mock("@/lib/api/domains/voice-api", () => ({ transcribeAudio })); + +// ── Mock SpeechRecognition ───────────────────────────────────────────── + +type SpeechHandle = { + start: () => void; + stop: () => void; + abort: () => void; + onresult: ((ev: { resultIndex: number; results: unknown }) => void) | null; + onerror: ((ev: { error: string 
}) => void) | null; + onend: (() => void) | null; + continuous: boolean; + interimResults: boolean; + maxAlternatives: number; + lang: string; + startCalls: number; + stopCalls: number; + abortCalls: number; +}; + +let recognitionInstance: SpeechHandle | null = null; + +// Factory pattern instead of `class` so we can avoid aliasing `this` in the +// constructor (the lint rule disallows it) while still satisfying the +// `new ()` shape that useVoiceInput's `new Ctor()` calls. +function FakeSpeechRecognition() { + const handle: SpeechHandle = { + continuous: false, + interimResults: false, + maxAlternatives: 1, + lang: "", + onresult: null, + onerror: null, + onend: null, + startCalls: 0, + stopCalls: 0, + abortCalls: 0, + start() { + handle.startCalls += 1; + }, + stop() { + handle.stopCalls += 1; + }, + abort() { + handle.abortCalls += 1; + }, + }; + recognitionInstance = handle; + return handle; +} + +// Import after mocks so the module under test sees the mocked store. +import { useVoiceInput } from "./use-voice-input"; + +// ── Tests ─────────────────────────────────────────────────────────────── + +beforeEach(() => { + voicePrefs.value = { + engine: "auto", + language: "auto", + mode: "toggle", + autoSend: false, + whisperWebModel: "base", + }; + recognitionInstance = null; + transcribeAudio.mockReset(); + (window as unknown as { SpeechRecognition: unknown }).SpeechRecognition = + FakeSpeechRecognition as unknown as new () => SpeechHandle; + // MediaRecorder/getUserMedia not used in the auto→webSpeech path, but provide + // a stub so capability detection sees audioCapture available too. 
+ (window as unknown as { MediaRecorder: { isTypeSupported: () => boolean } }).MediaRecorder = { + isTypeSupported: () => true, + }; + Object.defineProperty(global.navigator, "mediaDevices", { + value: { getUserMedia: vi.fn() }, + configurable: true, + }); +}); + +afterEach(() => { + delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition; + delete (window as unknown as { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition; + delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder; +}); + +describe("useVoiceInput — Web Speech engine", () => { + it("reports supported and resolves engine = webSpeech under the default auto preference", () => { + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() })); + expect(result.current.supported).toBe(true); + expect(result.current.engine).toBe("webSpeech"); + }); + + it("transitions idle → recording on start() and emits the final transcript on stop()", async () => { + const onTranscript = vi.fn(); + const { result } = renderHook(() => useVoiceInput({ onTranscript })); + + await act(async () => { + await result.current.start(); + }); + expect(result.current.state).toBe("recording"); + expect(recognitionInstance?.startCalls).toBe(1); + + act(() => { + recognitionInstance?.onresult?.({ + resultIndex: 0, + results: { + length: 1, + 0: { isFinal: true, length: 1, 0: { transcript: "hello world" } }, + } as unknown, + }); + recognitionInstance?.onend?.(); + }); + + await waitFor(() => { + expect(onTranscript).toHaveBeenCalledWith("hello world"); + expect(result.current.state).toBe("idle"); + }); + }); + + it("maps a not-allowed permission error to a permission-denied VoiceError", async () => { + const onError = vi.fn(); + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn(), onError })); + + await act(async () => { + await result.current.start(); + }); + act(() => { + recognitionInstance?.onerror?.({ error: "not-allowed" }); + }); + + 
expect(onError).toHaveBeenCalledWith({ + code: "permission-denied", + message: "Microphone permission denied.", + }); + expect(result.current.state).toBe("idle"); + }); +}); + +describe("useVoiceInput — capability gating", () => { + it("returns supported=false and engine=null when no engine is usable", () => { + delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition; + delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder; + Object.defineProperty(global.navigator, "mediaDevices", { value: {}, configurable: true }); + + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() })); + expect(result.current.supported).toBe(false); + expect(result.current.engine).toBeNull(); + }); + + it("disables the hook entirely when enabled=false", () => { + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn(), enabled: false })); + expect(result.current.supported).toBe(false); + expect(result.current.engine).toBeNull(); + }); +}); + +describe("useVoiceInput — language preference", () => { + it("passes the pinned BCP-47 language to SpeechRecognition.lang", async () => { + voicePrefs.value = { ...voicePrefs.value, language: "pt-PT" }; + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() })); + + await act(async () => { + await result.current.start(); + }); + expect(recognitionInstance?.lang).toBe("pt-PT"); + }); + + it("falls back to navigator.language when 'auto'", async () => { + voicePrefs.value = { ...voicePrefs.value, language: "auto" }; + Object.defineProperty(global.navigator, "language", { value: "fr-FR", configurable: true }); + const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() })); + await act(async () => { + await result.current.start(); + }); + expect(recognitionInstance?.lang).toBe("fr-FR"); + }); +}); diff --git a/apps/web/hooks/use-voice-input.ts b/apps/web/hooks/use-voice-input.ts new file mode 100644 index 000000000..454df30f8 --- /dev/null 
+++ b/apps/web/hooks/use-voice-input.ts
@@ -0,0 +1,546 @@
+"use client";
+
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { ApiError } from "@/lib/api/client";
+import { transcribeAudio } from "@/lib/api/domains/voice-api";
+import { detectVoiceCapabilities, resolveActiveEngine } from "@/lib/voice/capabilities";
+import { WhisperWebClient, type WhisperWebProgress } from "@/lib/voice/whisper-web-client";
+import { useAppStore } from "@/components/state-provider";
+import type { VoiceInputEngine, WhisperWebModelSize } from "@/lib/types/http-voice";
+
+// ── Public types ────────────────────────────────────────────────────────
+
+export type VoiceInputState = "idle" | "requesting" | "recording" | "processing";
+
+export type VoiceErrorCode =
+  | "permission-denied"
+  | "no-speech"
+  | "not-configured"
+  | "network"
+  | "unsupported"
+  | "model-load"
+  | "unknown";
+
+export type VoiceError = { code: VoiceErrorCode; message: string };
+
+export type VoiceModelLoadState = {
+  state: "idle" | "loading" | "ready" | "error";
+  progress: number;
+};
+
+export type UseVoiceInputOptions = {
+  onTranscript: (text: string) => void;
+  onError?: (error: VoiceError) => void;
+  /** Set false to disable the hook entirely (e.g. for read-only contexts). */
+  enabled?: boolean;
+};
+
+export type UseVoiceInputResult = {
+  supported: boolean;
+  engine: Exclude<VoiceInputEngine, "auto"> | null;
+  state: VoiceInputState;
+  error: VoiceError | null;
+  modelLoad: VoiceModelLoadState;
+  start: () => Promise<void>;
+  stop: () => Promise<void>;
+  cancel: () => void;
+};
+
+// ── Web Speech typings (DOM lib doesn't ship them) ─────────────────────
+
+type SpeechAlt = { transcript: string };
+type SpeechResult = { isFinal: boolean; 0: SpeechAlt; length: number };
+type SpeechResultList = { length: number; [index: number]: SpeechResult };
+type SpeechResultEvent = { resultIndex: number; results: SpeechResultList };
+type SpeechErrorEvent = { error: string; message?: string };
+type SpeechRecognitionInstance = {
+  lang: string;
+  continuous: boolean;
+  interimResults: boolean;
+  maxAlternatives: number;
+  start: () => void;
+  stop: () => void;
+  abort: () => void;
+  onresult: ((ev: SpeechResultEvent) => void) | null;
+  onerror: ((ev: SpeechErrorEvent) => void) | null;
+  onend: (() => void) | null;
+};
+
+type SpeechCtor = new () => SpeechRecognitionInstance;
+
+function createSpeechRecognition(): SpeechRecognitionInstance | null {
+  if (typeof window === "undefined") return null;
+  const w = window as Window & {
+    SpeechRecognition?: SpeechCtor;
+    webkitSpeechRecognition?: SpeechCtor;
+  };
+  const Ctor = w.SpeechRecognition ?? w.webkitSpeechRecognition;
+  return Ctor ? new Ctor() : null;
+}
+
+// ── Error mappers ───────────────────────────────────────────────────────
+
+function mapSpeechError(code: string): VoiceError {
+  if (code === "not-allowed" || code === "service-not-allowed") {
+    return { code: "permission-denied", message: "Microphone permission denied." };
+  }
+  if (code === "no-speech") return { code: "no-speech", message: "No speech detected. Try again." };
+  if (code === "network") {
+    return { code: "network", message: "Voice recognition lost network connection." };
+  }
+  if (code === "audio-capture") return { code: "unknown", message: "No microphone was found." };
+  return { code: "unknown", message: `Voice recognition error: ${code}` };
+}
+
+function mapMicError(err: unknown): VoiceError {
+  if (err && typeof err === "object" && "name" in err) {
+    const name = (err as { name: string }).name;
+    if (name === "NotAllowedError" || name === "SecurityError") {
+      return { code: "permission-denied", message: "Microphone permission denied." };
+    }
+    if (name === "NotFoundError" || name === "OverconstrainedError") {
+      return { code: "unknown", message: "No microphone was found." };
+    }
+  }
+  return { code: "unknown", message: "Failed to start recording." };
+}
+
+function mapTranscribeError(err: unknown): VoiceError {
+  if (err instanceof ApiError && err.status === 503) {
+    return {
+      code: "not-configured",
+      message:
+        "Server-side transcription isn't configured. Pick Web Speech or Whisper Web in Voice Mode settings.",
+    };
+  }
+  return { code: "network", message: "Transcription failed. Please try again." };
+}
+
+function whisperErrorMessage(err: unknown): VoiceError {
+  const message = err instanceof Error ? err.message : "Whisper Web failed to transcribe.";
+  return { code: "model-load", message };
+}
+
+function resolveLang(preference: string): string {
+  if (preference && preference !== "auto") return preference;
+  return typeof navigator !== "undefined" ? navigator.language : "en-US";
+}
+
+function resolveWhisperLang(preference: string): string | undefined {
+  if (!preference || preference === "auto") return undefined;
+  // Whisper's tokenizer only knows ISO 639-1 two-letter codes ("en", "pt").
+  // The settings UI stores BCP-47 ("en-US", "pt-BR") so we can render
+  // human-friendly variant names — strip the region suffix here so the hint
+  // isn't silently dropped by the pipeline (which would then auto-detect and
+  // potentially pick the wrong dialect).
+  const dash = preference.indexOf("-");
+  return dash > 0 ? preference.slice(0, dash).toLowerCase() : preference.toLowerCase();
+}
+
+// ── MediaRecorder capture primitive ─────────────────────────────────────
+
+function pickRecorderMime(): { mime: string; ext: string } {
+  if (typeof window === "undefined" || typeof window.MediaRecorder === "undefined") {
+    return { mime: "", ext: "webm" };
+  }
+  const candidates: Array<{ mime: string; ext: string }> = [
+    { mime: "audio/webm;codecs=opus", ext: "webm" },
+    { mime: "audio/webm", ext: "webm" },
+    { mime: "audio/mp4", ext: "m4a" },
+    { mime: "audio/ogg;codecs=opus", ext: "ogg" },
+    { mime: "audio/wav", ext: "wav" },
+  ];
+  for (const c of candidates) {
+    if (window.MediaRecorder.isTypeSupported(c.mime)) return c;
+  }
+  return { mime: "", ext: "webm" };
+}
+
+type CaptureHandle = {
+  stream: MediaStream;
+  recorder: MediaRecorder;
+  chunks: Blob[];
+  mime: string;
+  ext: string;
+};
+
+/**
+ * Requests the microphone and starts a MediaRecorder on the resulting stream.
+ * Rejects with the getUserMedia/MediaRecorder error when capture cannot start.
+ */
+async function startCapture(): Promise<CaptureHandle> {
+  const { mime, ext } = pickRecorderMime();
+  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+  try {
+    const recorder = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
+    const chunks: Blob[] = [];
+    recorder.addEventListener("dataavailable", (e) => {
+      if (e.data && e.data.size > 0) chunks.push(e.data);
+    });
+    recorder.start();
+    return { stream, recorder, chunks, mime, ext };
+  } catch (err) {
+    // The MediaRecorder constructor and start() can both throw. Release the
+    // tracks we already acquired, otherwise the microphone stays live.
+    for (const t of stream.getTracks()) t.stop();
+    throw err;
+  }
+}
+
+function teardownCapture(handle: CaptureHandle | null) {
+  if (!handle) return;
+  for (const t of handle.stream.getTracks()) t.stop();
+}
+
+function stopCapture(handle: CaptureHandle): Promise<Blob | null> {
+  return new Promise((resolve) => {
+    if (handle.recorder.state === "inactive") {
+      teardownCapture(handle);
+      resolve(null);
+      return;
+    }
+    handle.recorder.addEventListener(
+      "stop",
+      () => {
+        const type = handle.recorder.mimeType || handle.mime || "audio/webm";
+        const blob = handle.chunks.length > 0 ? new Blob(handle.chunks, { type }) : null;
+        teardownCapture(handle);
+        resolve(blob);
+      },
+      { once: true },
+    );
+    handle.recorder.stop();
+  });
+}
+
+// ── Driver refs ─────────────────────────────────────────────────────────
+
+type ActiveDriverRef =
+  | { kind: "webSpeech"; recognition: SpeechRecognitionInstance }
+  | { kind: "capture"; handle: CaptureHandle; engine: "whisperWeb" | "whisperServer" }
+  | null;
+
+type DriverRefBox = { current: ActiveDriverRef };
+type WhisperRefBox = { current: WhisperWebClient | null };
+type PendingStartBox = { current: symbol | null };
+
+function abortDriver(ref: DriverRefBox) {
+  const driver = ref.current;
+  if (!driver) return;
+  if (driver.kind === "webSpeech") {
+    // Detach callbacks before aborting so the trailing onerror/onend events
+    // that some browsers fire after .abort() don't sneak through and mutate
+    // hook state that the caller (cancel()) just reset.
+    driver.recognition.onresult = null;
+    driver.recognition.onerror = null;
+    driver.recognition.onend = null;
+    driver.recognition.abort();
+  } else teardownCapture(driver.handle);
+  ref.current = null;
+}
+
+// ── Web Speech driver ───────────────────────────────────────────────────
+
+type WebSpeechHandlers = {
+  setState: (s: VoiceInputState) => void;
+  driverRef: DriverRefBox;
+  emitError: (e: VoiceError) => void;
+  onTranscriptRef: { current: (text: string) => void };
+  lang: string;
+};
+
+function runWebSpeech(h: WebSpeechHandlers): void {
+  const recognition = createSpeechRecognition();
+  if (!recognition) {
+    h.emitError({ code: "unsupported", message: "Voice recognition is not supported." });
+    return;
+  }
+  const transcripts: string[] = [];
+  recognition.continuous = true;
+  recognition.interimResults = false;
+  recognition.maxAlternatives = 1;
+  recognition.lang = h.lang;
+  recognition.onresult = (ev) => {
+    for (let i = ev.resultIndex; i < ev.results.length; i++) {
+      const r = ev.results[i];
+      if (r.isFinal && r[0]?.transcript) transcripts.push(r[0].transcript.trim());
+    }
+  };
+  recognition.onerror = (ev) => h.emitError(mapSpeechError(ev.error));
+  recognition.onend = () => {
+    h.driverRef.current = null;
+    h.setState("idle");
+    const joined = transcripts.join(" ").trim();
+    if (joined) h.onTranscriptRef.current(joined);
+  };
+  try {
+    recognition.start();
+    h.driverRef.current = { kind: "webSpeech", recognition };
+    h.setState("recording");
+  } catch {
+    h.emitError({ code: "unknown", message: "Failed to start voice recognition." });
+  }
+}
+
+// ── Capture engines (whisperWeb + whisperServer) ───────────────────────
+
+type CaptureHandlers = {
+  setState: (s: VoiceInputState) => void;
+  emitError: (e: VoiceError) => void;
+  driverRef: DriverRefBox;
+  /** Reports whether this start request was cancelled while the mic prompt was pending. */
+  isCancelled?: () => boolean;
+};
+
+/**
+ * Moves the hook to "requesting", acquires the microphone and, on success,
+ * publishes the capture driver and moves to "recording". Mic failures are
+ * reported through emitError.
+ */
+async function beginCapture(
+  which: "whisperWeb" | "whisperServer",
+  h: CaptureHandlers,
+): Promise<void> {
+  h.setState("requesting");
+  try {
+    const handle = await startCapture();
+    if (h.isCancelled?.()) {
+      // cancel()/unmount ran while getUserMedia was pending: nobody owns this
+      // capture any more, so release the microphone instead of recording.
+      teardownCapture(handle);
+      return;
+    }
+    h.driverRef.current = { kind: "capture", handle, engine: which };
+    h.setState("recording");
+  } catch (err) {
+    // A cancelled request must not flip a newer session back to "idle".
+    if (!h.isCancelled?.()) h.emitError(mapMicError(err));
+  }
+}
+
+type FinishCaptureHandlers = {
+  driverRef: DriverRefBox;
+  whisperRef: WhisperRefBox;
+  setState: (s: VoiceInputState) => void;
+  setModelLoad: (next: VoiceModelLoadState) => void;
+  emitError: (e: VoiceError) => void;
+  onTranscriptRef: { current: (text: string) => void };
+  whisperModel: WhisperWebModelSize;
+  language: string;
+};
+
+async function finishCapture(h: FinishCaptureHandlers): Promise<void> {
+  const driver = h.driverRef.current;
+  if (!driver || driver.kind !== "capture") return;
+  // Claim the driver synchronously *before* the first await. In hold mode,
+  // pointerup + pointerleave both fire in the same task and both call stop();
+  // without this early null, the second invocation would also enter
+  // finishCapture, race the first, and could clobber a brand-new recording's
+  // driverRef if the user re-triggered between them.
+  h.driverRef.current = null;
+  h.setState("processing");
+  const blob = await stopCapture(driver.handle);
+  if (!blob) {
+    h.setState("idle");
+    return;
+  }
+  try {
+    const text =
+      driver.engine === "whisperServer"
+        ? await transcribeViaServer(blob, driver.handle.ext)
+        : await transcribeViaWhisperWeb(blob, h);
+    if (text) h.onTranscriptRef.current(text);
+    h.setState("idle");
+  } catch (err) {
+    if (driver.engine === "whisperServer") h.emitError(mapTranscribeError(err));
+    else h.emitError(whisperErrorMessage(err));
+  }
+}
+
+async function transcribeViaServer(blob: Blob, ext: string): Promise<string> {
+  const result = await transcribeAudio(blob, `recording.${ext}`);
+  return result.text.trim();
+}
+
+async function transcribeViaWhisperWeb(blob: Blob, h: FinishCaptureHandlers): Promise<string> {
+  const client = await ensureWhisperClient(h);
+  const text = await client.transcribe(blob, resolveWhisperLang(h.language));
+  return text.trim();
+}
+
+async function ensureWhisperClient(h: FinishCaptureHandlers): Promise<WhisperWebClient> {
+  if (!h.whisperRef.current) {
+    h.whisperRef.current = new WhisperWebClient({
+      onProgress: (p: WhisperWebProgress) =>
+        // transformers.js emits progress on a 0–100 scale, but the rest of the
+        // pipeline (and the button's `* 100` display) treats `modelLoad.progress`
+        // as a 0–1 fraction (matching the `ready: 1` convention below). Normalise
+        // here so the button doesn't render "5000%" mid-download.
+        h.setModelLoad({ state: "loading", progress: p.progress / 100 }),
+    });
+    h.setModelLoad({ state: "loading", progress: 0 });
+  }
+  try {
+    await h.whisperRef.current.init(h.whisperModel);
+    h.setModelLoad({ state: "ready", progress: 1 });
+  } catch (err) {
+    h.setModelLoad({ state: "error", progress: 0 });
+    throw err;
+  }
+  return h.whisperRef.current;
+}
+
+// ── Hook helpers ────────────────────────────────────────────────────────
+
+function useVoiceModePrefs() {
+  return useAppStore((s) => s.userSettings.voiceMode);
+}
+
+function useCallbackRefs(opts: UseVoiceInputOptions) {
+  const onTranscriptRef = useRef(opts.onTranscript);
+  const onErrorRef = useRef(opts.onError);
+  useEffect(() => {
+    onTranscriptRef.current = opts.onTranscript;
+    onErrorRef.current = opts.onError;
+  });
+  return { onTranscriptRef, onErrorRef };
+}
+
+// Re-init the whisper client whenever the user switches model size, so we
+// don't keep an old in-memory model around when the next start() runs.
+function useDisposeWhisperOnModelChange(
+  whisperRef: WhisperRefBox,
+  modelSize: string,
+  reset: () => void,
+) {
+  const previousModelRef = useRef(modelSize);
+  useEffect(() => {
+    if (previousModelRef.current === modelSize) return;
+    previousModelRef.current = modelSize;
+    whisperRef.current?.dispose();
+    whisperRef.current = null;
+    reset();
+  }, [modelSize, whisperRef, reset]);
+}
+
+function useUnmountCleanup(
+  driverRef: DriverRefBox,
+  whisperRef: WhisperRefBox,
+  pendingStartRef?: PendingStartBox,
+) {
+  useEffect(() => {
+    return () => {
+      // Invalidate any start() still waiting on the mic prompt so it releases
+      // the stream instead of recording into an unmounted hook.
+      if (pendingStartRef) pendingStartRef.current = null;
+      abortDriver(driverRef);
+      whisperRef.current?.dispose();
+      whisperRef.current = null;
+    };
+  }, [driverRef, whisperRef, pendingStartRef]);
+}
+
+// ── Hook ────────────────────────────────────────────────────────────────
+
+/**
+ * Voice-input hook: resolves the active engine from user prefs and browser
+ * capabilities and exposes start/stop/cancel plus recording/model-load state.
+ */
+export function useVoiceInput(opts: UseVoiceInputOptions): UseVoiceInputResult {
+  const caps = useMemo(() => detectVoiceCapabilities(), []);
+  const prefs = useVoiceModePrefs();
+  const enabled = opts.enabled !== false;
+  const engine = useMemo(
+    () => (enabled ? resolveActiveEngine(prefs.engine, caps, true) : null),
+    [enabled, prefs.engine, caps],
+  );
+  const supported = engine !== null;
+
+  const [state, setState] = useState<VoiceInputState>("idle");
+  const [error, setError] = useState<VoiceError | null>(null);
+  const [modelLoad, setModelLoad] = useState<VoiceModelLoadState>({
+    state: "idle",
+    progress: 0,
+  });
+
+  const driverRef = useRef<ActiveDriverRef>(null);
+  const whisperRef = useRef<WhisperWebClient | null>(null);
+  const pendingStartRef = useRef<symbol | null>(null);
+  const { onTranscriptRef, onErrorRef } = useCallbackRefs(opts);
+
+  const emitError = useCallback(
+    (e: VoiceError) => {
+      setError(e);
+      setState("idle");
+      onErrorRef.current?.(e);
+    },
+    [onErrorRef],
+  );
+
+  const resetModelLoad = useCallback(() => setModelLoad({ state: "idle", progress: 0 }), []);
+
+  useUnmountCleanup(driverRef, whisperRef, pendingStartRef);
+  useDisposeWhisperOnModelChange(whisperRef, prefs.whisperWebModel, resetModelLoad);
+
+  const start = useCallback(async () => {
+    if (!supported || !engine) {
+      emitError({ code: "unsupported", message: "Voice input is not supported in this browser." });
+      return;
+    }
+    // `state` is a render-time snapshot, so two calls in the same tick both
+    // see "idle". The refs are updated synchronously and close that window.
+    if (state !== "idle" || driverRef.current || pendingStartRef.current) return;
+    setError(null);
+    if (engine === "webSpeech") {
+      runWebSpeech({
+        setState,
+        driverRef,
+        emitError,
+        onTranscriptRef,
+        lang: resolveLang(prefs.language),
+      });
+      return;
+    }
+    const token = Symbol("voice-start");
+    pendingStartRef.current = token;
+    try {
+      await beginCapture(engine, {
+        setState,
+        emitError,
+        driverRef,
+        isCancelled: () => pendingStartRef.current !== token,
+      });
+    } finally {
+      if (pendingStartRef.current === token) pendingStartRef.current = null;
+    }
+  }, [supported, engine, state, emitError, prefs.language, onTranscriptRef]);
+
+  const stop = useCallback(async () => {
+    const driver = driverRef.current;
+    if (!driver) return;
+    if (driver.kind === "webSpeech") {
+      driver.recognition.stop();
+      return;
+    }
+    await finishCapture({
+      driverRef,
+      whisperRef,
+      setState,
+      setModelLoad,
+      emitError,
+      onTranscriptRef,
+      whisperModel: prefs.whisperWebModel,
+      language: prefs.language,
+    });
+  }, [emitError, prefs.whisperWebModel, prefs.language, onTranscriptRef]);
+
+  const cancel = useCallback(() => {
+    // Also invalidates a start() that is still waiting for the mic prompt.
+    pendingStartRef.current = null;
+    abortDriver(driverRef);
+    setState("idle");
+    setError(null);
+  }, []);
+
+  return { supported, engine, state, error, modelLoad, start, stop, cancel };
+}
diff
--git a/apps/web/lib/api/domains/settings-api.ts b/apps/web/lib/api/domains/settings-api.ts index 343e30efa..ec9b229be 100644 --- a/apps/web/lib/api/domains/settings-api.ts +++ b/apps/web/lib/api/domains/settings-api.ts @@ -21,6 +21,7 @@ import type { UserSettingsResponse, DynamicModelsResponse, } from "@/lib/types/http"; +import type { VoiceModeSettings } from "@/lib/types/http-voice"; // User settings export async function fetchUserSettings(options?: ApiRequestOptions) { @@ -52,6 +53,7 @@ export async function updateUserSettings( terminal_font_family?: string; terminal_font_size?: number; changes_panel_layout?: "flat" | "tree"; + voice_mode?: VoiceModeSettings; }, options?: ApiRequestOptions, ) { diff --git a/apps/web/lib/api/domains/voice-api.test.ts b/apps/web/lib/api/domains/voice-api.test.ts new file mode 100644 index 000000000..d3618cae8 --- /dev/null +++ b/apps/web/lib/api/domains/voice-api.test.ts @@ -0,0 +1,63 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { ApiError } from "../client"; +import { transcribeAudio } from "./voice-api"; + +const originalFetch = global.fetch; + +describe("transcribeAudio", () => { + afterEach(() => { + global.fetch = originalFetch; + }); + + beforeEach(() => { + vi.restoreAllMocks(); + }); + + it("posts multipart/form-data with the audio under the 'audio' field", async () => { + let capturedRequest: { method?: string; bodyText: string } = { bodyText: "" }; + global.fetch = vi.fn(async (_url: RequestInfo | URL, init?: RequestInit) => { + capturedRequest = { + method: init?.method, + bodyText: init?.body instanceof FormData ? 
"" : String(init?.body), + }; + return new Response(JSON.stringify({ text: "hi" }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + }) as unknown as typeof fetch; + + const blob = new Blob([new Uint8Array([1, 2, 3])], { type: "audio/webm" }); + const result = await transcribeAudio(blob, "clip.webm", { + baseUrl: "http://example.test", + }); + + expect(result.text).toBe("hi"); + expect(capturedRequest.method).toBe("POST"); + expect(capturedRequest.bodyText).toBe(""); + }); + + it("throws ApiError(503) when the server reports not-configured", async () => { + global.fetch = vi.fn( + async () => + new Response(JSON.stringify({ error: "voice transcription is not configured" }), { + status: 503, + }), + ) as unknown as typeof fetch; + + const blob = new Blob([new Uint8Array([1])], { type: "audio/webm" }); + await expect(transcribeAudio(blob, "x.webm", { baseUrl: "http://x" })).rejects.toMatchObject({ + status: 503, + }); + }); + + it("surfaces non-2xx errors as ApiError instances", async () => { + global.fetch = vi.fn( + async () => new Response("bad", { status: 502, statusText: "Bad Gateway" }), + ) as unknown as typeof fetch; + + const blob = new Blob([new Uint8Array([1])], { type: "audio/webm" }); + await expect(transcribeAudio(blob, "x.webm", { baseUrl: "http://x" })).rejects.toBeInstanceOf( + ApiError, + ); + }); +}); diff --git a/apps/web/lib/api/domains/voice-api.ts b/apps/web/lib/api/domains/voice-api.ts new file mode 100644 index 000000000..d3af1a571 --- /dev/null +++ b/apps/web/lib/api/domains/voice-api.ts @@ -0,0 +1,51 @@ +import { ApiError, type ApiRequestOptions } from "../client"; +import { getBackendConfig } from "@/lib/config"; + +export type TranscribeResponse = { + text: string; +}; + +/** + * POST audio to the backend Whisper fallback. Returns the transcribed text. + * + * Throws ApiError on non-2xx. 
Two statuses are meaningful to the caller:
+ *   - 503: server has no API key configured — the hook should treat the
+ *     Whisper fallback as unavailable and surface a clean message.
+ *   - any other non-2xx: transient error — show a generic toast.
+ */
+export async function transcribeAudio(
+  blob: Blob,
+  filename: string,
+  options?: ApiRequestOptions,
+): Promise<TranscribeResponse> {
+  const baseUrl = options?.baseUrl ?? getBackendConfig().apiBaseUrl;
+  const formData = new FormData();
+  formData.append("audio", blob, filename);
+
+  // Do NOT set Content-Type: the browser sets multipart/form-data with the
+  // correct boundary automatically when given a FormData body. Spread caller
+  // init *first* so method/body always win — otherwise a caller passing
+  // `init: { method: "GET" }` (or a stale body) would silently break the upload.
+  const response = await fetch(`${baseUrl}/api/v1/transcribe`, {
+    ...options?.init,
+    method: "POST",
+    body: formData,
+  });
+
+  if (!response.ok) {
+    let body: unknown = null;
+    try {
+      body = await response.json();
+    } catch {
+      // body remains null
+    }
+    let message = `Transcription failed: ${response.status} ${response.statusText}`;
+    if (body && typeof body === "object" && "error" in body) {
+      const errVal = (body as { error?: unknown }).error;
+      if (typeof errVal === "string") message = errVal;
+    }
+    throw new ApiError(message, response.status, body);
+  }
+
+  return (await response.json()) as TranscribeResponse;
+}
diff --git a/apps/web/lib/keyboard/constants.ts b/apps/web/lib/keyboard/constants.ts
index 31271c0b9..e05ab2373 100644
--- a/apps/web/lib/keyboard/constants.ts
+++ b/apps/web/lib/keyboard/constants.ts
@@ -153,4 +153,10 @@ export const SHORTCUTS = {
     key: KEYS.F,
     modifiers: { ctrlOrCmd: true },
   },
+  // Cmd+Shift+M starts/stops voice input on the chat composer. The default
+  // is configurable per-user via the Voice Mode settings page.
+ VOICE_INPUT_TOGGLE: { + key: KEYS.M, + modifiers: { ctrlOrCmd: true, shift: true }, + }, } as const; diff --git a/apps/web/lib/keyboard/shortcut-overrides.test.ts b/apps/web/lib/keyboard/shortcut-overrides.test.ts index 6453bc902..43c59c3df 100644 --- a/apps/web/lib/keyboard/shortcut-overrides.test.ts +++ b/apps/web/lib/keyboard/shortcut-overrides.test.ts @@ -20,7 +20,8 @@ describe("CONFIGURABLE_SHORTCUTS", () => { expect(ids).toContain("FOCUS_INPUT"); expect(ids).toContain("TOGGLE_PLAN_MODE"); expect(ids).toContain("TASK_SWITCHER"); - expect(ids).toHaveLength(10); + expect(ids).toContain("VOICE_INPUT_TOGGLE"); + expect(ids).toHaveLength(11); }); it("each entry has a label and default matching SHORTCUTS", () => { diff --git a/apps/web/lib/keyboard/shortcut-overrides.ts b/apps/web/lib/keyboard/shortcut-overrides.ts index 8ac1b7a37..a31d61e15 100644 --- a/apps/web/lib/keyboard/shortcut-overrides.ts +++ b/apps/web/lib/keyboard/shortcut-overrides.ts @@ -10,7 +10,8 @@ export type ConfigurableShortcutId = | "NEW_TASK" | "FOCUS_INPUT" | "TOGGLE_PLAN_MODE" - | "TASK_SWITCHER"; + | "TASK_SWITCHER" + | "VOICE_INPUT_TOGGLE"; export type StoredShortcutOverrides = Record< string, @@ -31,6 +32,7 @@ export const CONFIGURABLE_SHORTCUTS: Record< FOCUS_INPUT: { label: "Focus Chat Input", default: SHORTCUTS.FOCUS_INPUT }, TOGGLE_PLAN_MODE: { label: "Toggle Plan Mode", default: SHORTCUTS.TOGGLE_PLAN_MODE }, TASK_SWITCHER: { label: "Recent Task Switcher", default: SHORTCUTS.TASK_SWITCHER }, + VOICE_INPUT_TOGGLE: { label: "Voice Input", default: SHORTCUTS.VOICE_INPUT_TOGGLE }, }; export function getShortcut( diff --git a/apps/web/lib/ssr/user-settings.test.ts b/apps/web/lib/ssr/user-settings.test.ts index 04f425b0d..38b681b6e 100644 --- a/apps/web/lib/ssr/user-settings.test.ts +++ b/apps/web/lib/ssr/user-settings.test.ts @@ -1,5 +1,10 @@ import { describe, it, expect } from "vitest"; -import { buildCoreFields, mapUserSettingsResponse, parseChangesPanelLayout } from "./user-settings"; 
+import {
+  buildCoreFields,
+  mapUserSettingsResponse,
+  parseChangesPanelLayout,
+  parseVoiceMode,
+} from "./user-settings";
 
 describe("buildCoreFields", () => {
   it("maps terminal_font_family to terminalFontFamily", () => {
@@ -103,3 +108,78 @@ describe("parseChangesPanelLayout", () => {
     expect(parseChangesPanelLayout("")).toBe("flat");
   });
 });
+
+describe("parseVoiceMode", () => {
+  it("maps every field from the snake_case wire payload", () => {
+    expect(
+      parseVoiceMode({
+        enabled: false,
+        engine: "whisperWeb",
+        language: "pt-PT",
+        mode: "hold",
+        auto_send: true,
+        whisper_web_model: "small",
+      }),
+    ).toEqual({
+      enabled: false,
+      engine: "whisperWeb",
+      language: "pt-PT",
+      mode: "hold",
+      autoSend: true,
+      whisperWebModel: "small",
+    });
+  });
+
+  it("returns the defaults when the payload is undefined", () => {
+    expect(parseVoiceMode(undefined)).toEqual({
+      enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    });
+  });
+
+  it("defaults enabled to true when the wire payload omits the field (old rows)", () => {
+    // The cast goes through the function's own parameter type so the partial
+    // literal stays in sync with parseVoiceMode's signature.
+    const result = parseVoiceMode({
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      auto_send: false,
+      whisper_web_model: "base",
+    } as unknown as Parameters<typeof parseVoiceMode>[0]);
+    expect(result.enabled).toBe(true);
+  });
+
+  it("fills in defaults for missing string fields and coerces auto_send to false", () => {
+    const result = parseVoiceMode({
+      engine: "" as unknown as "auto",
+      language: "",
+      mode: "" as unknown as "toggle",
+      whisper_web_model: "" as unknown as "base",
+    } as unknown as Parameters<typeof parseVoiceMode>[0]);
+    expect(result).toEqual({
+      enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    });
+  });
+});
+
+describe("mapUserSettingsResponse voice mode", () => {
+  it("defaults the whole voiceMode object when response is null", () => {
+    const result = mapUserSettingsResponse(null);
+    expect(result.voiceMode).toEqual({
enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    });
+  });
+});
diff --git a/apps/web/lib/ssr/user-settings.ts b/apps/web/lib/ssr/user-settings.ts
index 74a3d127f..b2ed73508 100644
--- a/apps/web/lib/ssr/user-settings.ts
+++ b/apps/web/lib/ssr/user-settings.ts
@@ -1,6 +1,8 @@
 import { fromApiSidebarView } from "@/lib/state/slices/ui/sidebar-view-wire";
 import type { SidebarView } from "@/lib/state/slices/ui/sidebar-view-types";
+import { DEFAULT_VOICE_MODE_STATE, type VoiceModeState } from "@/lib/state/slices/settings/types";
 import type { SavedLayout, UserSettingsResponse } from "@/lib/types/http";
+import type { VoiceModeSettings } from "@/lib/types/http-voice";
 
 export type UserSettingsData = NonNullable;
 
@@ -12,6 +14,26 @@ export function parseChangesPanelLayout(value: string | undefined): "flat" | "tr
   return value === "tree" ? "tree" : "flat";
 }
 
+/**
+ * Maps the backend's snake_case VoiceMode payload into the camelCase shape
+ * the store and UI use. A missing payload yields a copy of the defaults; an
+ * absent or empty string field, or a non-boolean flag, falls back to the
+ * matching DEFAULT_VOICE_MODE_STATE value, so an old user row never surfaces
+ * as an empty string the radio groups can't render. Voice mode is opt-out.
+ */
+export function parseVoiceMode(value: VoiceModeSettings | undefined): VoiceModeState {
+  const defaults = DEFAULT_VOICE_MODE_STATE;
+  if (!value) return { ...defaults };
+  return {
+    enabled: typeof value.enabled === "boolean" ? value.enabled : defaults.enabled,
+    engine: value.engine || defaults.engine,
+    language: value.language || defaults.language,
+    mode: value.mode || defaults.mode,
+    autoSend: typeof value.auto_send === "boolean" ? value.auto_send : defaults.autoSend,
+    whisperWebModel: value.whisper_web_model || defaults.whisperWebModel,
+  };
+}
+
 function buildTerminalFields(s: UserSettingsData) {
   return {
     terminalLinkBehavior: parseTerminalLinkBehavior(s.terminal_link_behavior),
@@ -21,6 +43,10 @@ function buildTerminalFields(s: UserSettingsData) {
   };
 }
 
+function buildVoiceModeFields(s: UserSettingsData) {
+  return { voiceMode: parseVoiceMode(s.voice_mode) };
+}
+
 function buildIdentityFields(s: UserSettingsData) {
   return {
     workspaceId: s.workspace_id || null,
@@ -51,6 +77,7 @@ export function buildCoreFields(s: UserSettingsData) {
     savedLayouts: s.saved_layouts ?? [],
     sidebarViews: (s.sidebar_views ?? []).map(fromApiSidebarView) as SidebarView[],
     ...buildTerminalFields(s),
+    ...buildVoiceModeFields(s),
   };
 }
 
@@ -91,6 +118,7 @@ export function mapUserSettingsResponse(response: UserSettingsResponse | null) {
     terminalFontFamily: null,
     terminalFontSize: null,
     changesPanelLayout: "flat" as const,
+    voiceMode: { ...DEFAULT_VOICE_MODE_STATE },
     ...buildLspFields(undefined),
     loaded: false,
   };
diff --git a/apps/web/lib/state/slices/settings/settings-slice.ts b/apps/web/lib/state/slices/settings/settings-slice.ts
index 26ce9c67b..d9dca4acb 100644
--- a/apps/web/lib/state/slices/settings/settings-slice.ts
+++ b/apps/web/lib/state/slices/settings/settings-slice.ts
@@ -1,5 +1,5 @@
 import type { StateCreator } from "zustand";
-import type { SettingsSlice, SettingsSliceState } from "./types";
+import { DEFAULT_VOICE_MODE_STATE, type SettingsSlice, type SettingsSliceState } from "./types";
 
 export const defaultSettingsState: SettingsSliceState = {
   executors: { items: [] },
@@ -44,6 +44,7 @@ export const defaultSettingsState: SettingsSliceState = {
     terminalFontFamily: null,
     terminalFontSize: null,
     changesPanelLayout: "flat",
+    voiceMode: { ...DEFAULT_VOICE_MODE_STATE },
     loaded: false,
   },
 };
diff --git a/apps/web/lib/state/slices/settings/types.ts b/apps/web/lib/state/slices/settings/types.ts
index 
73f094761..ca7740a93 100644 --- a/apps/web/lib/state/slices/settings/types.ts +++ b/apps/web/lib/state/slices/settings/types.ts @@ -11,6 +11,11 @@ import type { SavedLayout, ToolStatus, } from "@/lib/types/http"; +import type { + VoiceInputActivationMode, + VoiceInputEngine, + WhisperWebModelSize, +} from "@/lib/types/http-voice"; import type { SidebarView } from "@/lib/state/slices/ui/sidebar-view-types"; import type { SecretListItem } from "@/lib/types/http-secrets"; import type { SpritesStatus, SpritesInstance } from "@/lib/types/http-sprites"; @@ -156,9 +161,29 @@ export type UserSettingsState = { terminalFontFamily: string | null; terminalFontSize: number | null; changesPanelLayout: "flat" | "tree"; + voiceMode: VoiceModeState; loaded: boolean; }; +export type VoiceModeState = { + enabled: boolean; + engine: VoiceInputEngine; + language: string; + mode: VoiceInputActivationMode; + autoSend: boolean; + whisperWebModel: WhisperWebModelSize; +}; + +/** Default values used by the slice init and by SSR hydration fallback. 
*/ +export const DEFAULT_VOICE_MODE_STATE: VoiceModeState = { + enabled: true, + engine: "auto", + language: "auto", + mode: "toggle", + autoSend: false, + whisperWebModel: "base", +}; + export type SettingsSliceState = { executors: ExecutorsState; settingsAgents: SettingsAgentsState; diff --git a/apps/web/lib/types/backend.ts b/apps/web/lib/types/backend.ts index c97912e3c..448dba0f9 100644 --- a/apps/web/lib/types/backend.ts +++ b/apps/web/lib/types/backend.ts @@ -383,6 +383,7 @@ export type UserSettingsUpdatedPayload = { keyboard_shortcuts?: Record }>; terminal_link_behavior?: string; changes_panel_layout?: "flat" | "tree"; + voice_mode?: import("@/lib/types/http-voice").VoiceModeSettings; updated_at?: string; }; diff --git a/apps/web/lib/types/http-voice.ts b/apps/web/lib/types/http-voice.ts new file mode 100644 index 000000000..c43351524 --- /dev/null +++ b/apps/web/lib/types/http-voice.ts @@ -0,0 +1,17 @@ +/** + * Wire types for the Voice Mode user settings. Kept in their own module so + * http.ts stays under the 600-line file limit. 
+ */ + +export type VoiceInputEngine = "auto" | "webSpeech" | "whisperWeb" | "whisperServer"; +export type VoiceInputActivationMode = "toggle" | "hold"; +export type WhisperWebModelSize = "tiny" | "base" | "small"; + +export type VoiceModeSettings = { + enabled: boolean; + engine: VoiceInputEngine; + language: string; + mode: VoiceInputActivationMode; + auto_send: boolean; + whisper_web_model: WhisperWebModelSize; +}; diff --git a/apps/web/lib/types/http.ts b/apps/web/lib/types/http.ts index fae94bf0c..0953be4c3 100644 --- a/apps/web/lib/types/http.ts +++ b/apps/web/lib/types/http.ts @@ -406,6 +406,8 @@ export type SidebarViewApi = { collapsed_groups: string[]; }; +import type { VoiceModeSettings } from "./http-voice"; + export type UserSettings = { user_id: string; workspace_id: WorkspaceId; @@ -432,6 +434,7 @@ export type UserSettings = { terminal_font_family?: string; terminal_font_size?: number; changes_panel_layout?: "flat" | "tree"; + voice_mode?: VoiceModeSettings; updated_at: string; }; diff --git a/apps/web/lib/voice/capabilities.test.ts b/apps/web/lib/voice/capabilities.test.ts new file mode 100644 index 000000000..d8b8d7191 --- /dev/null +++ b/apps/web/lib/voice/capabilities.test.ts @@ -0,0 +1,97 @@ +import { describe, it, expect, afterEach, vi } from "vitest"; +import { detectVoiceCapabilities, resolveActiveEngine } from "./capabilities"; + +describe("detectVoiceCapabilities", () => { + afterEach(() => { + vi.unstubAllGlobals(); + delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition; + delete (window as unknown as { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition; + delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder; + }); + + it("reports webSpeech true when window.SpeechRecognition exists", () => { + (window as unknown as { SpeechRecognition: () => void }).SpeechRecognition = () => {}; + expect(detectVoiceCapabilities().webSpeech).toBe(true); + }); + + it("reports webSpeech true on the 
prefixed webkit variant too", () => { + (window as unknown as { webkitSpeechRecognition: () => void }).webkitSpeechRecognition = + () => {}; + expect(detectVoiceCapabilities().webSpeech).toBe(true); + }); + + it("reports audioCapture true when MediaRecorder + getUserMedia are present", () => { + (window as unknown as { MediaRecorder: object }).MediaRecorder = { + isTypeSupported: () => true, + }; + vi.stubGlobal("navigator", { mediaDevices: { getUserMedia: () => Promise.resolve({}) } }); + expect(detectVoiceCapabilities().audioCapture).toBe(true); + }); + + it("reports everything false when no APIs are available", () => { + vi.stubGlobal("navigator", {}); + expect(detectVoiceCapabilities()).toEqual({ + webSpeech: false, + whisperWeb: false, + audioCapture: false, + }); + }); +}); + +describe("resolveActiveEngine", () => { + const allAvailable = { webSpeech: true, whisperWeb: true, audioCapture: true }; + + it("auto picks webSpeech first when available", () => { + expect(resolveActiveEngine("auto", allAvailable, true)).toBe("webSpeech"); + }); + + it("auto falls back to whisperWeb when webSpeech is missing", () => { + expect( + resolveActiveEngine("auto", { webSpeech: false, whisperWeb: true, audioCapture: true }, true), + ).toBe("whisperWeb"); + }); + + it("auto falls back to whisperServer when no in-browser engine is available", () => { + expect( + resolveActiveEngine( + "auto", + { webSpeech: false, whisperWeb: false, audioCapture: true }, + true, + ), + ).toBe("whisperServer"); + }); + + it("returns null when nothing is usable", () => { + expect( + resolveActiveEngine( + "auto", + { webSpeech: false, whisperWeb: false, audioCapture: false }, + true, + ), + ).toBeNull(); + }); + + it("honors a pinned engine when usable", () => { + expect(resolveActiveEngine("whisperWeb", allAvailable, true)).toBe("whisperWeb"); + }); + + it("falls back along the auto order when the pinned engine is missing", () => { + expect( + resolveActiveEngine( + "whisperWeb", + { webSpeech: 
true, whisperWeb: false, audioCapture: true },
+        true,
+      ),
+    ).toBe("webSpeech");
+  });
+
+  it("treats whisperServer as unusable when serverFallbackEnabled is false", () => {
+    expect(
+      resolveActiveEngine(
+        "whisperServer",
+        { webSpeech: false, whisperWeb: false, audioCapture: true },
+        false,
+      ),
+    ).toBeNull();
+  });
+});
diff --git a/apps/web/lib/voice/capabilities.ts b/apps/web/lib/voice/capabilities.ts
new file mode 100644
index 000000000..6fd36f161
--- /dev/null
+++ b/apps/web/lib/voice/capabilities.ts
@@ -0,0 +1,73 @@
+"use client";
+
+import type { VoiceInputEngine } from "@/lib/types/http-voice";
+
+/**
+ * Capability report for the voice-mode engines available in the current
+ * browser. Shared between `useVoiceInput` (which picks the active engine)
+ * and the Voice Mode settings page (which decides which options to render).
+ */
+export type VoiceCapabilities = {
+  webSpeech: boolean;
+  whisperWeb: boolean;
+  /** True if the browser supports MediaRecorder + getUserMedia, the floor
+   * for any audio-capture engine (whisperWeb + whisperServer). */
+  audioCapture: boolean;
+};
+
+/**
+ * Detects which voice engines this browser can run. Safe to call during
+ * SSR — returns all-false instead of throwing on missing globals.
+ */
+export function detectVoiceCapabilities(): VoiceCapabilities {
+  if (typeof window === "undefined") {
+    return { webSpeech: false, whisperWeb: false, audioCapture: false };
+  }
+  const w = window as Window & {
+    SpeechRecognition?: unknown;
+    webkitSpeechRecognition?: unknown;
+  };
+  const webSpeech = !!(w.SpeechRecognition || w.webkitSpeechRecognition);
+  const audioCapture =
+    typeof navigator !== "undefined" &&
+    typeof navigator.mediaDevices?.getUserMedia === "function" &&
+    typeof window.MediaRecorder !== "undefined";
+  // whisper-web piggybacks on transformers.js which only needs a Worker plus
+  // either WebGPU or WebAssembly. Every modern browser has both, so the
+  // gating constraint is having MediaRecorder for capture.
+  const whisperWeb = audioCapture && typeof Worker !== "undefined";
+  return { webSpeech, whisperWeb, audioCapture };
+}
+
+/**
+ * Resolves the active voice-input engine given a user preference and the
+ * detected capabilities. Returns null when nothing usable is available.
+ *
+ * Auto-fallback order: Web Speech (cheapest, native) → Whisper Web (private,
+ * heavier) → Whisper Server (always works but requires a configured server).
+ * If the user pinned a specific engine that isn't available, we degrade
+ * gracefully along the same order.
+ */
+export function resolveActiveEngine(
+  preference: VoiceInputEngine,
+  caps: VoiceCapabilities,
+  serverFallbackEnabled: boolean,
+): Exclude<VoiceInputEngine, "auto"> | null {
+  const order: Array<Exclude<VoiceInputEngine, "auto">> = [
+    "webSpeech",
+    "whisperWeb",
+    "whisperServer",
+  ];
+
+  const isUsable = (e: Exclude<VoiceInputEngine, "auto">) => {
+    if (e === "webSpeech") return caps.webSpeech;
+    if (e === "whisperWeb") return caps.whisperWeb;
+    return caps.audioCapture && serverFallbackEnabled;
+  };
+
+  // An explicitly pinned engine wins whenever it is usable.
+  if (preference !== "auto" && isUsable(preference)) return preference;
+  // "auto", or a pinned engine that isn't usable: take the first usable engine
+  // in the auto order so the button still works instead of silently no-op.
+  return order.find(isUsable) ?? null;
+}
diff --git a/apps/web/lib/voice/whisper-web-client.ts b/apps/web/lib/voice/whisper-web-client.ts
new file mode 100644
index 000000000..e9d1cc620
--- /dev/null
+++ b/apps/web/lib/voice/whisper-web-client.ts
@@ -0,0 +1,199 @@
+"use client";
+
+import { whisperModelConfig } from "./whisper-web-models";
+import type { WhisperWebModelSize } from "@/lib/types/http-voice";
+
+/**
+ * Sample rate Whisper expects.
We resample the captured audio to this rate
+ * (mono Float32Array) before sending to the worker — Whisper's own decoder
+ * would do this too, but doing it here keeps the worker focused on inference.
+ */
+const WHISPER_SAMPLE_RATE = 16000;
+
+export type WhisperWebProgress = {
+  stage: string;
+  progress: number;
+};
+
+export type WhisperWebHandlers = {
+  onProgress?: (p: WhisperWebProgress) => void;
+};
+
+type WorkerMessage =
+  | { type: "progress"; stage: string; progress: number }
+  | { type: "ready" }
+  | { type: "result"; text: string }
+  | { type: "error"; message: string };
+
+type Pending = {
+  kind: "init" | "transcribe";
+  resolve: (value: string | undefined) => void;
+  reject: (err: Error) => void;
+};
+
+/**
+ * Client wrapper around the whisper-web worker. Hides the postMessage
+ * protocol behind a clean promise-based API and handles the audio decode +
+ * resample step so callers only see "Blob in, transcript out".
+ *
+ * At most one request (init or transcribe) is in flight at a time; a second
+ * request issued while one is pending is rejected rather than queued.
+ */
+export class WhisperWebClient {
+  private worker: Worker | null = null;
+  private pending: Pending | null = null;
+  private ready = false;
+  private loadingModelId: string | null = null;
+
+  constructor(private handlers: WhisperWebHandlers = {}) {}
+
+  /**
+   * Lazy-creates the worker on first use. Returns a promise that resolves
+   * when the requested model is loaded and ready to transcribe. Rejects if
+   * another request is in flight or the worker reports a load error.
+   */
+  async init(size: WhisperWebModelSize): Promise<void> {
+    const config = whisperModelConfig(size);
+    if (this.ready && this.loadingModelId === config.modelId) return;
+    // Bail out before touching any state: overwriting `ready`/`loadingModelId`
+    // while another request is still in flight would let that request's
+    // completion mark the wrong model as loaded.
+    if (this.pending) {
+      throw new Error("WhisperWebClient: another request is in flight");
+    }
+    this.ensureWorker();
+    this.loadingModelId = config.modelId;
+    this.ready = false;
+    await this.send({ kind: "init", payload: { type: "init", model: config.modelId } });
+    this.ready = true;
+  }
+
+  /**
+   * Transcribe a recorded blob. The blob may be in any container the browser
+   * can decode (audio/webm, audio/wav, audio/mp4, …) — we resample everything
+   * to 16 kHz mono Float32 before handing to the worker. Resolves to the
+   * transcript, or "" when the worker returns no text.
+   */
+  async transcribe(blob: Blob, language?: string): Promise<string> {
+    if (!this.ready || !this.worker) {
+      throw new Error("WhisperWebClient: not initialized");
+    }
+    const audio = await blobToWhisperFloat32(blob);
+    const text = await this.send({
+      kind: "transcribe",
+      payload: { type: "transcribe", audio, language },
+      // Transfer the sample buffer to the worker instead of copying it.
+      transfer: [audio.buffer],
+    });
+    return text ?? "";
+  }
+
+  /** Tear down the worker and release the loaded model. */
+  dispose(): void {
+    if (this.worker) {
+      try {
+        this.worker.postMessage({ type: "dispose" });
+      } catch {
+        // ignore
+      }
+      this.worker.terminate();
+      this.worker = null;
+    }
+    this.ready = false;
+    this.loadingModelId = null;
+    if (this.pending) {
+      this.pending.reject(new Error("WhisperWebClient disposed"));
+      this.pending = null;
+    }
+  }
+
+  /** Creates the worker and wires its listeners; no-op if one already exists. */
+  private ensureWorker() {
+    if (this.worker) return;
+    // The `new Worker(new URL(..., import.meta.url))` form is Next.js / webpack's
+    // recommended pattern — webpack handles the bundling and asset path.
+    const ownWorker = new Worker(new URL("../../workers/whisper-web.worker.ts", import.meta.url), {
+      type: "module",
+    });
+    this.worker = ownWorker;
+    // Both listeners close over `ownWorker`. A late event from a worker we
+    // already disposed can still arrive after its replacement was created;
+    // the identity checks below keep such stale events away from the new
+    // worker's state and from the request it is currently serving.
+    ownWorker.addEventListener("message", (e: MessageEvent<WorkerMessage>) => {
+      if (this.worker === ownWorker) this.handleMessage(e.data);
+    });
+    ownWorker.addEventListener("error", (e) => {
+      ownWorker.terminate();
+      if (this.worker !== ownWorker) return;
+      this.worker = null;
+      this.ready = false;
+      this.loadingModelId = null;
+      if (this.pending) {
+        this.pending.reject(new Error(e.message || "Whisper worker crashed"));
+        this.pending = null;
+      }
+    });
+  }
+
+  /**
+   * Posts one request to the worker and returns a promise settled by the
+   * worker's next non-progress message. Rejects when there is no worker or
+   * when another request is already pending.
+   */
+  private send(args: {
+    kind: "init" | "transcribe";
+    payload: object;
+    transfer?: Transferable[];
+  }): Promise<string | undefined> {
+    const worker = this.worker;
+    if (!worker) {
+      return Promise.reject(new Error("WhisperWebClient: worker not initialized"));
+    }
+    if (this.pending) {
+      return Promise.reject(new Error("WhisperWebClient: another request is in flight"));
+    }
+    return new Promise<string | undefined>((resolve, reject) => {
+      this.pending = { kind: args.kind, resolve, reject };
+      try {
+        worker.postMessage(args.payload, args.transfer ?? []);
+      } catch (err) {
+        // Nothing was sent, so no reply will ever clear `pending`; release it
+        // here or every later request would fail as "in flight".
+        this.pending = null;
+        reject(err instanceof Error ? err : new Error(String(err)));
+      }
+    });
+  }
+
+  /** Routes a worker message: progress goes to the handler, the rest settles the pending request. */
+  private handleMessage(msg: WorkerMessage) {
+    if (msg.type === "progress") {
+      this.handlers.onProgress?.({ stage: msg.stage, progress: msg.progress });
+      return;
+    }
+    const pending = this.pending;
+    if (!pending) return;
+    this.pending = null;
+    if (msg.type === "error") {
+      pending.reject(new Error(msg.message));
+      return;
+    }
+    if (msg.type === "ready") {
+      pending.resolve(undefined);
+      return;
+    }
+    if (msg.type === "result") {
+      pending.resolve(msg.text);
+    }
+  }
+}
+
+/**
+ * Decode an arbitrary audio Blob and return a Float32Array sampled at 16 kHz
+ * mono — the format Whisper expects.
+ */
+export async function blobToWhisperFloat32(blob: Blob): Promise<Float32Array> {
+  const arrayBuffer = await blob.arrayBuffer();
+  // Decode using an AudioContext at the source rate, then bounce through an
+  // OfflineAudioContext for the resample. AudioContext.decodeAudioData
+  // tolerates webm/opus, mp4/aac, wav, ogg — anything the browser can play.
+  const AudioCtor =
+    window.AudioContext ??
+    (window as unknown as { webkitAudioContext?: typeof AudioContext }).webkitAudioContext;
+  if (!AudioCtor) throw new Error("AudioContext is not available in this browser");
+  const decodeCtx = new AudioCtor();
+  let decoded: AudioBuffer;
+  try {
+    decoded = await decodeCtx.decodeAudioData(arrayBuffer);
+  } finally {
+    // Always release the decode context, even when decoding fails.
+    await decodeCtx.close();
+  }
+  return resampleToMono16k(decoded);
+}
+
+/**
+ * Renders `buf` through a one-channel OfflineAudioContext running at
+ * WHISPER_SAMPLE_RATE and returns a copy of the rendered channel data.
+ */
+async function resampleToMono16k(buf: AudioBuffer): Promise<Float32Array> {
+  // Number of output frames needed to cover the clip at the target rate.
+  const length = Math.ceil(buf.duration * WHISPER_SAMPLE_RATE);
+  const offline = new OfflineAudioContext(1, length, WHISPER_SAMPLE_RATE);
+  const source = offline.createBufferSource();
+  source.buffer = buf;
+  source.connect(offline.destination);
+  source.start(0);
+  const rendered = await offline.startRendering();
+  // slice() copies the samples into a standalone array, separate from the
+  // rendered AudioBuffer.
+  return rendered.getChannelData(0).slice();
+}
diff --git a/apps/web/lib/voice/whisper-web-models.ts b/apps/web/lib/voice/whisper-web-models.ts
new file mode 100644
index 000000000..eaffe6698
--- /dev/null
+++ b/apps/web/lib/voice/whisper-web-models.ts
@@ -0,0 +1,42 @@
+import type { WhisperWebModelSize } from "@/lib/types/http-voice";
+
+export type WhisperModelConfig = {
+  size: WhisperWebModelSize;
+  /** Hugging Face model id. Use the `onnx-community/*` mirrors — `Xenova/*`
+   * defaults to 4-bit MatMulNBits weights that crash on WASM (see note below). */
+  modelId: string;
+  /** Rough on-disk size after download, shown in the settings UI. */
+  approxBytes: number;
+  /** Human-readable label. */
+  label: string;
+};
+
+// The `onnx-community/whisper-*` mirrors are the maintained transformers.js
+// exports. The older `Xenova/whisper-*` mirrors default to 4-bit (`MatMulNBits`)
+// weights that only run on WebGPU — on WASM they fail with
+// `Missing required scale: ... weight_merged_0_scale`. The worker loads these
+// mirrors with an fp32 encoder and an fp16 decoder, not a quantized variant.
+/** Per-size model settings, keyed by the size stored in the user's voice settings. */
+export const WHISPER_WEB_MODELS: Record<WhisperWebModelSize, WhisperModelConfig> = {
+  tiny: {
+    size: "tiny",
+    modelId: "onnx-community/whisper-tiny",
+    approxBytes: 40 * 1024 * 1024,
+    label: "Whisper Tiny",
+  },
+  base: {
+    size: "base",
+    modelId: "onnx-community/whisper-base",
+    approxBytes: 75 * 1024 * 1024,
+    label: "Whisper Base",
+  },
+  small: {
+    size: "small",
+    modelId: "onnx-community/whisper-small",
+    approxBytes: 240 * 1024 * 1024,
+    label: "Whisper Small",
+  },
+};
+
+/**
+ * Returns the config for `size`, falling back to the `base` entry when the
+ * value is not a key of WHISPER_WEB_MODELS.
+ */
+export function whisperModelConfig(size: WhisperWebModelSize): WhisperModelConfig {
+  return WHISPER_WEB_MODELS[size] ?? WHISPER_WEB_MODELS.base;
+}
diff --git a/apps/web/lib/ws/handlers/users.ts b/apps/web/lib/ws/handlers/users.ts
index 1ddb7a71c..0b33698d3 100644
--- a/apps/web/lib/ws/handlers/users.ts
+++ b/apps/web/lib/ws/handlers/users.ts
@@ -1,6 +1,7 @@
 import type { StoreApi } from "zustand";
 import type { AppState } from "@/lib/state/store";
 import type { WsHandlers } from "@/lib/ws/handlers/types";
+import { parseVoiceMode } from "@/lib/ssr/user-settings";
 
 export function registerUsersHandlers(store: StoreApi<AppState>): WsHandlers {
   return {
@@ -31,6 +32,7 @@ export function registerUsersHandlers(store: StoreApi<AppState>): WsHandlers {
               ? "browser_panel"
               : "new_tab",
           changesPanelLayout: message.payload.changes_panel_layout === "tree" ? "tree" : "flat",
+          voiceMode: parseVoiceMode(message.payload.voice_mode),
           loaded: true,
         },
       }));
diff --git a/apps/web/package.json b/apps/web/package.json
index 7de93f075..369517e61 100644
--- a/apps/web/package.json
+++ b/apps/web/package.json
@@ -38,6 +38,7 @@
     "@dnd-kit/core": "^6.3.1",
     "@dnd-kit/sortable": "^10.0.0",
     "@dnd-kit/utilities": "^3.2.2",
+    "@huggingface/transformers": "^4.2.0",
     "@kandev/theme": "workspace:*",
     "@kandev/types": "workspace:*",
     "@kandev/ui": "workspace:*",
diff --git a/apps/web/workers/whisper-web.worker.ts b/apps/web/workers/whisper-web.worker.ts
new file mode 100644
index 000000000..68fa33e4b
--- /dev/null
+++ b/apps/web/workers/whisper-web.worker.ts
@@ -0,0 +1,138 @@
+/// <reference lib="webworker" />
+
+/**
+ * Web Worker that runs OpenAI Whisper entirely in the browser via
+ * @huggingface/transformers (the maintained transformers.js library that
+ * xenova/whisper-web is built on).
+ *
+ * Lives in its own worker because model loading + inference both block the
+ * main thread for several seconds — would freeze the chat input otherwise.
+ *
+ * Wire protocol (postMessage):
+ *   in:  { type: "init", model: "onnx-community/whisper-base" }
+ *   in:  { type: "transcribe", audio: Float32Array, language?: string }
+ *   in:  { type: "dispose" }
+ *   out: { type: "progress", stage: string, progress: number }
+ *   out: { type: "ready" }
+ *   out: { type: "result", text: string }
+ *   out: { type: "error", message: string }
+ */
+
+import { pipeline, env, type AutomaticSpeechRecognitionPipeline } from "@huggingface/transformers";
+
+// Disable transformers.js's local-models lookup — we only load from the HF
+// CDN so the worker doesn't try to fetch files from our own origin.
+env.allowLocalModels = false;
+env.allowRemoteModels = true;
+
+type InitMessage = { type: "init"; model: string };
+type TranscribeMessage = { type: "transcribe"; audio: Float32Array; language?: string };
+type DisposeMessage = { type: "dispose" };
+type InMessage = InitMessage | TranscribeMessage | DisposeMessage;
+
+type OutMessage =
+  | { type: "progress"; stage: string; progress: number }
+  | { type: "ready" }
+  | { type: "result"; text: string }
+  | { type: "error"; message: string };
+
+const ctx = self as unknown as DedicatedWorkerGlobalScope;
+
+// The currently loaded pipeline and the model id it was created from.
+let asrPipeline: AutomaticSpeechRecognitionPipeline | null = null;
+let activeModelId: string | null = null;
+
+/** Sends one protocol message back to the client. */
+function post(message: OutMessage) {
+  ctx.postMessage(message);
+}
+
+// Fields read from the pipeline's progress callback. Named so it does not
+// shadow the DOM's global `ProgressEvent`.
+type ModelLoadProgress = {
+  status?: string;
+  file?: string;
+  progress?: number;
+};
+
+/**
+ * Loads the requested model, replacing any previously loaded one. Always
+ * answers with exactly one "ready" or "error" message; "progress" messages
+ * may precede it while the model loads.
+ */
+async function handleInit(msg: InitMessage) {
+  if (asrPipeline && activeModelId === msg.model) {
+    post({ type: "ready" });
+    return;
+  }
+  try {
+    // Dispose inside the try: if releasing the old pipeline throws, the client
+    // still gets an "error" reply instead of waiting forever.
+    if (asrPipeline) {
+      const previous = asrPipeline;
+      asrPipeline = null;
+      activeModelId = null;
+      await previous.dispose();
+    }
+    // dtype choice rationale: the `_quantized` / `q8` and `q4` decoder weights
+    // for whisper-base both contain `MatMulNBits` ops that only execute on
+    // WebGPU. On browsers without WebGPU (most Firefox, older Chrome) onnxruntime
+    // throws `Missing required scale: ... weight_merged_0_scale`. fp16 has no
+    // quantized ops at all so it works on both WASM and WebGPU; it's ~half the
+    // size of fp32 with no perceptible accuracy loss for ASR.
+    const created = await pipeline("automatic-speech-recognition", msg.model, {
+      dtype: {
+        encoder_model: "fp32",
+        decoder_model_merged: "fp16",
+      },
+      progress_callback: (e: ModelLoadProgress) => {
+        // Only forward events that carry a numeric progress value.
+        if (typeof e?.progress === "number") {
+          post({
+            type: "progress",
+            stage: e.status ?? "download",
+            progress: e.progress,
+          });
+        }
+      },
+    });
+    asrPipeline = created as AutomaticSpeechRecognitionPipeline;
+    activeModelId = msg.model;
+    post({ type: "ready" });
+  } catch (err) {
+    post({ type: "error", message: errorMessage(err) });
+  }
+}
+
+/**
+ * Runs the loaded pipeline on the given samples and answers with "result"
+ * (trimmed text) or "error". A language of "auto" or no language is passed
+ * to the pipeline as undefined.
+ */
+async function handleTranscribe(msg: TranscribeMessage) {
+  if (!asrPipeline) {
+    post({ type: "error", message: "Whisper worker not initialized" });
+    return;
+  }
+  try {
+    const result = (await asrPipeline(msg.audio, {
+      language: msg.language && msg.language !== "auto" ? msg.language : undefined,
+      task: "transcribe",
+    })) as { text?: string } | Array<{ text?: string }>;
+    // The result may be a single object or a list of chunks; join chunk texts.
+    const text = Array.isArray(result)
+      ? result.map((r) => r.text ?? "").join(" ")
+      : (result.text ?? "");
+    post({ type: "result", text: text.trim() });
+  } catch (err) {
+    post({ type: "error", message: errorMessage(err) });
+  }
+}
+
+/** Releases the loaded pipeline, if any. The protocol defines no reply for "dispose". */
+async function handleDispose() {
+  const current = asrPipeline;
+  asrPipeline = null;
+  activeModelId = null;
+  if (!current) return;
+  try {
+    await current.dispose();
+  } catch {
+    // Best effort: there is no reply message for "dispose", and posting an
+    // "error" here could be mistaken for the answer to another request.
+  }
+}
+
+/** Extracts a printable message from an unknown thrown value. */
+function errorMessage(err: unknown): string {
+  if (err instanceof Error) return err.message;
+  return String(err);
+}
+
+// Dispatch incoming protocol messages. Handlers report their own failures,
+// so the returned promises are intentionally not awaited.
+ctx.addEventListener("message", (event: MessageEvent<InMessage>) => {
+  const msg = event.data;
+  switch (msg.type) {
+    case "init":
+      void handleInit(msg);
+      break;
+    case "transcribe":
+      void handleTranscribe(msg);
+      break;
+    case "dispose":
+      void handleDispose();
+      break;
+  }
+});