diff --git a/apps/backend/cmd/kandev/helpers.go b/apps/backend/cmd/kandev/helpers.go
index 58eeec1f5..dccbb750d 100644
--- a/apps/backend/cmd/kandev/helpers.go
+++ b/apps/backend/cmd/kandev/helpers.go
@@ -71,6 +71,8 @@ import (
 	userhandlers "github.com/kandev/kandev/internal/user/handlers"
 	utilitycontroller "github.com/kandev/kandev/internal/utility/controller"
 	utilityhandlers "github.com/kandev/kandev/internal/utility/handlers"
+	voicehandlers "github.com/kandev/kandev/internal/voice/handlers"
+	"github.com/kandev/kandev/internal/voice/transcribe"
 	workflowcontroller "github.com/kandev/kandev/internal/workflow/controller"
 	workflowhandlers "github.com/kandev/kandev/internal/workflow/handlers"
 	"github.com/kandev/kandev/internal/worktree"
@@ -449,6 +451,7 @@ type routeParams struct {
 	devMode                 bool
 	httpPort                int
 	features                config.FeaturesConfig
+	voice                   config.VoiceConfig
 	log                     *logger.Logger
 }
 
@@ -698,6 +701,11 @@ func registerSecondaryRoutes(
 	utilityhandlers.RegisterRoutes(p.router, p.utilityCtrl, p.lifecycleMgr, p.hostUtilityMgr, p.services.User, p.log)
 	p.log.Debug("Registered Utility Agents handlers (HTTP)")
 
+	// Voice transcription fallback. The route always mounts, but returns 503
+	// when no API key is configured so the frontend can hide the path.
+	voicehandlers.RegisterRoutes(p.router, transcribe.New(p.voice.OpenAIAPIKey), p.log)
+	p.log.Debug("Registered Voice handlers (HTTP)")
+
 	agentcapabilities.RegisterRoutes(p.router, p.hostUtilityMgr, p.log)
 	p.log.Debug("Registered Agent Capabilities handlers (HTTP)")
 
diff --git a/apps/backend/cmd/kandev/main.go b/apps/backend/cmd/kandev/main.go
index fa3070382..4f771448c 100644
--- a/apps/backend/cmd/kandev/main.go
+++ b/apps/backend/cmd/kandev/main.go
@@ -1513,6 +1513,7 @@ func buildHTTPServer(
 		devMode:                 cfg.Debug.DevMode || cfg.Debug.PprofEnabled,
 		httpPort:                port,
 		features:                cfg.Features,
+		voice:                   cfg.Voice,
 		log:                     log,
 	})
 
diff --git a/apps/backend/internal/common/config/config.go b/apps/backend/internal/common/config/config.go
index 2d76bcca4..9028d3dce 100644
--- a/apps/backend/internal/common/config/config.go
+++ b/apps/backend/internal/common/config/config.go
@@ -41,6 +41,7 @@ type Config struct {
 	RepoClone           RepoCloneConfig           `mapstructure:"repoClone"`
 	Debug               DebugConfig               `mapstructure:"debug"`
 	Office              OfficeConfig              `mapstructure:"office"`
+	Voice               VoiceConfig               `mapstructure:"voice"`
 	Features            FeaturesConfig            `mapstructure:"features"`
 }
 
@@ -147,6 +148,20 @@ type OfficeConfig struct {
 	JWTSigningKey string `mapstructure:"jwtSigningKey"`
 }
 
+// VoiceConfig holds configuration for the chat voice-input transcription
+// fallback. The primary voice-input engine runs entirely in the browser
+// (Web Speech API); this server-side fallback is only used when the browser
+// has no SpeechRecognition support (e.g. Firefox).
+//
+// When OpenAIAPIKey is empty the /api/v1/transcribe endpoint returns 503
+// so the frontend can hide the fallback path; the feature is therefore safe
+// to ship un-configured.
+type VoiceConfig struct {
+	// OpenAIAPIKey is the API key used to call OpenAI's Whisper transcription
+	// endpoint. Config key voice.openAIApiKey; env KANDEV_VOICE_OPENAI_API_KEY.
+	OpenAIAPIKey string `mapstructure:"openAIApiKey"`
+}
+
 // FeaturesConfig is the central registry of runtime feature flags. Every flag
 // defaults to false so production binaries ship with new work hidden until a
 // deployment explicitly opts in (env var, e.g. KANDEV_FEATURES_OFFICE=true).
@@ -312,6 +327,9 @@ func setDefaults(v *viper.Viper) {
 	// Office defaults
 	v.SetDefault("office.jwtSigningKey", "")
 
+	// Voice defaults
+	v.SetDefault("voice.openAIApiKey", "")
+
 	// Feature-flag defaults live in ./features.yaml (symlinked to
 	// apps/backend/internal/features/features.yaml). LoadWithPath applies
 	// them via features.ApplyDefaults after this function returns so the
@@ -428,6 +446,7 @@ func LoadWithPath(configPath string) (*Config, error) {
 	_ = v.BindEnv("events.namespace", "KANDEV_EVENTS_NAMESPACE")
 	_ = v.BindEnv("debug.devMode", "KANDEV_DEBUG_DEV_MODE")
 	_ = v.BindEnv("debug.pprofEnabled", "KANDEV_DEBUG_PPROF_ENABLED")
+	_ = v.BindEnv("voice.openAIApiKey", "KANDEV_VOICE_OPENAI_API_KEY")
 
 	// Configure config file
 	v.SetConfigName("config")
diff --git a/apps/backend/internal/user/controller/controller.go b/apps/backend/internal/user/controller/controller.go
index 2b6cb8c8f..1ca49c98e 100644
--- a/apps/backend/internal/user/controller/controller.go
+++ b/apps/backend/internal/user/controller/controller.go
@@ -68,6 +68,7 @@ func (c *Controller) UpdateUserSettings(ctx context.Context, req dto.UpdateUserS
 		TerminalFontFamily:          req.TerminalFontFamily,
 		TerminalFontSize:            req.TerminalFontSize,
 		ChangesPanelLayout:          req.ChangesPanelLayout,
+		VoiceMode:                   req.VoiceMode,
 	})
 	if err != nil {
 		return dto.UserSettingsResponse{}, err
diff --git a/apps/backend/internal/user/dto/dto.go b/apps/backend/internal/user/dto/dto.go
index 450f11b2d..3329aeb1a 100644
--- a/apps/backend/internal/user/dto/dto.go
+++ b/apps/backend/internal/user/dto/dto.go
@@ -39,6 +39,7 @@ type UserSettingsDTO struct {
 	TerminalFontFamily          string                            `json:"terminal_font_family"`
 	TerminalFontSize            int                               `json:"terminal_font_size"`
 	ChangesPanelLayout          string                            `json:"changes_panel_layout"`
+	VoiceMode                   models.VoiceModeSettings          `json:"voice_mode"`
 	UpdatedAt                   string                            `json:"updated_at"`
 }
 
@@ -82,6 +83,7 @@ type UpdateUserSettingsRequest struct {
 	TerminalFontFamily          *string                            `json:"terminal_font_family,omitempty"`
 	TerminalFontSize            *int                               `json:"terminal_font_size,omitempty"`
 	ChangesPanelLayout          *string                            `json:"changes_panel_layout,omitempty"`
+	VoiceMode                   *models.VoiceModeSettings          `json:"voice_mode,omitempty"`
 }
 
 func FromUser(user *models.User) UserDTO {
@@ -120,6 +122,7 @@ func FromUserSettings(settings *models.UserSettings) UserSettingsDTO {
 		TerminalFontFamily:          settings.TerminalFontFamily,
 		TerminalFontSize:            settings.TerminalFontSize,
 		ChangesPanelLayout:          settings.ChangesPanelLayout,
+		VoiceMode:                   settings.VoiceMode,
 		UpdatedAt:                   settings.UpdatedAt.Format(time.RFC3339),
 	}
 }
diff --git a/apps/backend/internal/user/models/models.go b/apps/backend/internal/user/models/models.go
index 4b48a5ff4..80475e904 100644
--- a/apps/backend/internal/user/models/models.go
+++ b/apps/backend/internal/user/models/models.go
@@ -38,10 +38,36 @@ type UserSettings struct {
 	TerminalFontFamily          string                            `json:"terminal_font_family"`
 	TerminalFontSize            int                               `json:"terminal_font_size"`
 	ChangesPanelLayout          string                            `json:"changes_panel_layout"` // "flat" | "tree"
+	VoiceMode                   VoiceModeSettings                 `json:"voice_mode"`
 	CreatedAt                   time.Time                         `json:"created_at"`
 	UpdatedAt                   time.Time                         `json:"updated_at"`
 }
 
+// VoiceModeSettings is the per-user configuration surface for the chat
+// voice-input feature. Stored as a nested JSON object (key `voice_mode`) inside
+// the `users.settings` blob — adding fields here does not require a schema migration.
+type VoiceModeSettings struct {
+	// Enabled gates the whole feature. When false, the mic button is hidden
+	// entirely and no voice-related hooks run on the chat input. Defaults to
+	// true for new users; pre-existing user rows that have no `enabled` field
+	// in their stored JSON are also treated as enabled (see store layer).
+	Enabled bool `json:"enabled"`
+	// Engine is the user's preferred transcription engine, validated on update.
+	// "auto" | "webSpeech" | "whisperWeb" | "whisperServer". Default "auto".
+	Engine string `json:"engine"`
+	// Language is a BCP-47 tag, or "auto" to use the browser's language; the
+	// service trims but does not validate it. E.g. "en-US", "pt-PT". Default "auto".
+	Language string `json:"language"`
+	// Mode controls how the mic button is activated: "toggle" (click to start/stop)
+	// or "hold" (push-to-talk). Validated on update. Default "toggle".
+	Mode string `json:"mode"`
+	// AutoSend submits the chat message right after the transcript is inserted. Default false.
+	AutoSend bool `json:"auto_send"`
+	// WhisperWebModel selects the in-browser Whisper model when engine = whisperWeb.
+	// "tiny" | "base" | "small". Validated on update. Default "base".
+	WhisperWebModel string `json:"whisper_web_model"`
+}
+
 // SavedLayout represents a user-saved dockview layout configuration.
 type SavedLayout struct {
 	ID        string          `json:"id"`
diff --git a/apps/backend/internal/user/service/service.go b/apps/backend/internal/user/service/service.go
index f83991a86..1a2709335 100644
--- a/apps/backend/internal/user/service/service.go
+++ b/apps/backend/internal/user/service/service.go
@@ -58,6 +58,7 @@ type UpdateUserSettingsRequest struct {
 	TerminalFontFamily          *string
 	TerminalFontSize            *int
 	ChangesPanelLayout          *string
+	VoiceMode                   *models.VoiceModeSettings
 }
 
 func NewService(repo store.Repository, eventBus bus.EventBus, log *logger.Logger) *Service {
@@ -122,6 +123,9 @@ func (s *Service) UpdateUserSettings(ctx context.Context, req *UpdateUserSetting
 	if err := applySidebarViews(settings, req); err != nil {
 		return nil, fmt.Errorf("%w: %s", ErrValidation, err.Error())
 	}
+	if err := applyVoiceMode(settings, req.VoiceMode); err != nil {
+		return nil, fmt.Errorf("%w: %s", ErrValidation, err.Error())
+	}
 	settings.UpdatedAt = time.Now().UTC()
 	if err := s.repo.UpsertUserSettings(ctx, settings); err != nil {
 		return nil, err
@@ -220,6 +224,66 @@ func applyChangesPanelLayout(settings *models.UserSettings, value *string) error
 	return nil
 }
 
+var (
+	validVoiceEngines = map[string]struct{}{
+		"auto":          {},
+		"webSpeech":     {},
+		"whisperWeb":    {},
+		"whisperServer": {},
+	}
+	validVoiceModes = map[string]struct{}{
+		"toggle": {},
+		"hold":   {},
+	}
+	validWhisperWebModels = map[string]struct{}{
+		"tiny":  {},
+		"base":  {},
+		"small": {},
+	}
+)
+
+// applyVoiceMode validates the inbound voice-mode settings and merges them
+// onto settings.VoiceMode. A nil value is a no-op. Empty string fields mean
+// "no change"; language is trimmed first so whitespace-only input is ignored.
+//
+// `enabled` and `auto_send` are plain bools with no "unset" sentinel, so they
+// are always overwritten from value. The settings UI always sends the full
+// VoiceMode object, so partial updates zeroing them are not a real concern.
+func applyVoiceMode(settings *models.UserSettings, value *models.VoiceModeSettings) error {
+	if value == nil {
+		return nil
+	}
+	current := settings.VoiceMode
+	if current.Engine == "" {
+		current.Engine = "auto"
+	}
+	if value.Engine != "" {
+		if _, ok := validVoiceEngines[value.Engine]; !ok {
+			return errors.New("voice_mode.engine must be 'auto', 'webSpeech', 'whisperWeb', or 'whisperServer'")
+		}
+		current.Engine = value.Engine
+	}
+	if lang := strings.TrimSpace(value.Language); lang != "" {
+		current.Language = lang
+	}
+	if value.Mode != "" {
+		if _, ok := validVoiceModes[value.Mode]; !ok {
+			return errors.New("voice_mode.mode must be 'toggle' or 'hold'")
+		}
+		current.Mode = value.Mode
+	}
+	if value.WhisperWebModel != "" {
+		if _, ok := validWhisperWebModels[value.WhisperWebModel]; !ok {
+			return errors.New("voice_mode.whisper_web_model must be 'tiny', 'base', or 'small'")
+		}
+		current.WhisperWebModel = value.WhisperWebModel
+	}
+	current.AutoSend = value.AutoSend
+	current.Enabled = value.Enabled
+	settings.VoiceMode = current
+	return nil
+}
+
 // applyChatSubmitKey validates and applies the chat_submit_key setting.
 func (s *Service) applyChatSubmitKey(settings *models.UserSettings, req *UpdateUserSettingsRequest) error {
 	if req.ChatSubmitKey == nil {
@@ -332,6 +396,7 @@ func (s *Service) publishUserSettingsEvent(ctx context.Context, settings *models
 		"terminal_font_family":            settings.TerminalFontFamily,
 		"terminal_font_size":              settings.TerminalFontSize,
 		"changes_panel_layout":            settings.ChangesPanelLayout,
+		"voice_mode":                      settings.VoiceMode,
 		"updated_at":                      settings.UpdatedAt.Format(time.RFC3339),
 	}
 	if err := s.eventBus.Publish(ctx, events.UserSettingsUpdated, bus.NewEvent(events.UserSettingsUpdated, "user-service", data)); err != nil {
diff --git a/apps/backend/internal/user/service/service_test.go b/apps/backend/internal/user/service/service_test.go
index 1e3a1efdb..4a460d00e 100644
--- a/apps/backend/internal/user/service/service_test.go
+++ b/apps/backend/internal/user/service/service_test.go
@@ -441,3 +441,114 @@ func TestApplySidebarViews(t *testing.T) {
 		})
 	}
 }
+
+func TestApplyVoiceMode(t *testing.T) {
+	t.Run("nil value leaves settings unchanged", func(t *testing.T) {
+		settings := &models.UserSettings{
+			VoiceMode: models.VoiceModeSettings{Engine: "webSpeech", Language: "en-US"},
+		}
+		if err := applyVoiceMode(settings, nil); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if settings.VoiceMode.Engine != "webSpeech" || settings.VoiceMode.Language != "en-US" {
+			t.Fatalf("expected unchanged, got %+v", settings.VoiceMode)
+		}
+	})
+
+	t.Run("happy path: applies a full update", func(t *testing.T) {
+		settings := &models.UserSettings{}
+		err := applyVoiceMode(settings, &models.VoiceModeSettings{
+			Enabled:         true,
+			Engine:          "whisperWeb",
+			Language:        "pt-PT",
+			Mode:            "hold",
+			AutoSend:        true,
+			WhisperWebModel: "small",
+		})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		want := models.VoiceModeSettings{
+			Enabled:         true,
+			Engine:          "whisperWeb",
+			Language:        "pt-PT",
+			Mode:            "hold",
+			AutoSend:        true,
+			WhisperWebModel: "small",
+		}
+		if settings.VoiceMode != want {
+			t.Fatalf("expected %+v, got %+v", want, settings.VoiceMode)
+		}
+	})
+
+	t.Run("enabled=false is honored (user disabled the feature)", func(t *testing.T) {
+		settings := &models.UserSettings{VoiceMode: models.VoiceModeSettings{Enabled: true}}
+		if err := applyVoiceMode(settings, &models.VoiceModeSettings{Enabled: false}); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if settings.VoiceMode.Enabled {
+			t.Fatalf("expected Enabled=false after disable, got true")
+		}
+	})
+
+	t.Run("invalid engine is rejected", func(t *testing.T) {
+		err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{Engine: "bogus"})
+		if err == nil || !strings.Contains(err.Error(), "voice_mode.engine") {
+			t.Fatalf("expected engine validation error, got %v", err)
+		}
+	})
+
+	t.Run("invalid mode is rejected", func(t *testing.T) {
+		err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{Mode: "tap"})
+		if err == nil || !strings.Contains(err.Error(), "voice_mode.mode") {
+			t.Fatalf("expected mode validation error, got %v", err)
+		}
+	})
+
+	t.Run("invalid whisper_web_model is rejected", func(t *testing.T) {
+		err := applyVoiceMode(&models.UserSettings{}, &models.VoiceModeSettings{WhisperWebModel: "huge"})
+		if err == nil || !strings.Contains(err.Error(), "voice_mode.whisper_web_model") {
+			t.Fatalf("expected model validation error, got %v", err)
+		}
+	})
+
+	t.Run("partial update preserves string fields but zeroes booleans", func(t *testing.T) {
+		settings := &models.UserSettings{
+			VoiceMode: models.VoiceModeSettings{
+				Enabled:         true,
+				Engine:          "whisperServer",
+				Language:        "en-GB",
+				Mode:            "toggle",
+				AutoSend:        true,
+				WhisperWebModel: "tiny",
+			},
+		}
+		// Empty strings on the new payload mean "no change" for the string fields,
+		// but bools have no "unset" sentinel — every PATCH carries them. The settings
+		// UI always sends the full VoiceMode object, so partial updates here would
+		// only happen in tests or hand-crafted requests; the assertions below lock in
+		// that explicit behavior so it doesn't drift silently.
+		err := applyVoiceMode(settings, &models.VoiceModeSettings{Engine: "webSpeech"})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if settings.VoiceMode.Engine != "webSpeech" {
+			t.Fatalf("expected engine=webSpeech, got %q", settings.VoiceMode.Engine)
+		}
+		if settings.VoiceMode.Language != "en-GB" {
+			t.Fatalf("expected language preserved, got %q", settings.VoiceMode.Language)
+		}
+		if settings.VoiceMode.Mode != "toggle" {
+			t.Fatalf("expected mode preserved, got %q", settings.VoiceMode.Mode)
+		}
+		if settings.VoiceMode.WhisperWebModel != "tiny" {
+			t.Fatalf("expected whisper model preserved, got %q", settings.VoiceMode.WhisperWebModel)
+		}
+		if settings.VoiceMode.Enabled {
+			t.Fatalf("expected Enabled zeroed on partial update, got true")
+		}
+		if settings.VoiceMode.AutoSend {
+			t.Fatalf("expected AutoSend zeroed on partial update, got true")
+		}
+	})
+}
diff --git a/apps/backend/internal/user/store/sqlite.go b/apps/backend/internal/user/store/sqlite.go
index 954c9f93b..6671caf8c 100644
--- a/apps/backend/internal/user/store/sqlite.go
+++ b/apps/backend/internal/user/store/sqlite.go
@@ -162,6 +162,7 @@ func (r *sqliteRepository) UpsertUserSettings(ctx context.Context, settings *mod
 		"terminal_font_family":            settings.TerminalFontFamily,
 		"terminal_font_size":              settings.TerminalFontSize,
 		"changes_panel_layout":            settings.ChangesPanelLayout,
+		"voice_mode":                      settings.VoiceMode,
 	})
 	if err != nil {
 		return err
@@ -192,6 +193,58 @@ func scanUser(scanner interface{ Scan(dest ...any) error }) (*models.User, error
 	return user, nil
 }
 
+// defaultVoiceModeSettings returns the baseline VoiceMode configuration used
+// when a user has no saved preferences. Mirrored on the frontend; keep in sync.
+func defaultVoiceModeSettings() models.VoiceModeSettings {
+	return models.VoiceModeSettings{
+		Enabled:         true,
+		Engine:          "auto",
+		Language:        "auto",
+		Mode:            "toggle",
+		AutoSend:        false,
+		WhisperWebModel: "base",
+	}
+}
+
+// storedVoiceMode is the on-disk JSON shape. `enabled` is a *bool so "absent"
+// (older rows written before the toggle existed — must default to true) can be
+// told apart from "explicitly false". AutoSend stays a bool: absent reads as false.
+type storedVoiceMode struct {
+	Enabled         *bool  `json:"enabled"`
+	Engine          string `json:"engine"`
+	Language        string `json:"language"`
+	Mode            string `json:"mode"`
+	AutoSend        bool   `json:"auto_send"`
+	WhisperWebModel string `json:"whisper_web_model"`
+}
+
+// mergeVoiceModeDefaults overlays a stored VoiceMode payload on the defaults:
+// non-empty strings and a non-nil `enabled` win, AutoSend is always copied,
+// and every other field keeps its default value. A nil payload (no voice_mode
+// object in the stored JSON) yields the bare defaults.
+func mergeVoiceModeDefaults(stored *storedVoiceMode) models.VoiceModeSettings {
+	merged := defaultVoiceModeSettings()
+	if stored == nil {
+		return merged
+	}
+	// override replaces *dst only when the stored string is non-empty.
+	override := func(dst *string, v string) {
+		if v != "" {
+			*dst = v
+		}
+	}
+	override(&merged.Engine, stored.Engine)
+	override(&merged.Language, stored.Language)
+	override(&merged.Mode, stored.Mode)
+	override(&merged.WhisperWebModel, stored.WhisperWebModel)
+	// A nil pointer means `enabled` was absent (or null) in the stored JSON.
+	if stored.Enabled != nil {
+		merged.Enabled = *stored.Enabled
+	}
+	merged.AutoSend = stored.AutoSend
+	return merged
+}
+
 func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID string) (*models.UserSettings, error) {
 	settings := &models.UserSettings{}
 	var settingsRaw string
@@ -208,6 +261,7 @@ func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID strin
 		settings.TerminalLinkBehavior = "new_tab"
 		settings.ChangesPanelLayout = "flat"
 		settings.SidebarViews = []models.SidebarView{}
+		settings.VoiceMode = defaultVoiceModeSettings()
 		return settings, nil
 	}
 	var payload struct {
@@ -235,6 +289,7 @@ func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID strin
 		TerminalFontFamily          string                            `json:"terminal_font_family"`
 		TerminalFontSize            int                               `json:"terminal_font_size"`
 		ChangesPanelLayout          string                            `json:"changes_panel_layout"`
+		VoiceMode                   *storedVoiceMode                  `json:"voice_mode"`
 	}
 	if err := json.Unmarshal([]byte(settingsRaw), &payload); err != nil {
 		return nil, err
@@ -294,6 +349,7 @@ func scanUserSettings(scanner interface{ Scan(dest ...any) error }, userID strin
 	}
 	settings.TerminalFontFamily = payload.TerminalFontFamily
 	settings.TerminalFontSize = payload.TerminalFontSize
+	settings.VoiceMode = mergeVoiceModeDefaults(payload.VoiceMode)
 	if payload.ChangesPanelLayout == "tree" {
 		settings.ChangesPanelLayout = "tree"
 	} else {
diff --git a/apps/backend/internal/voice/handlers/transcribe_handlers.go b/apps/backend/internal/voice/handlers/transcribe_handlers.go
new file mode 100644
index 000000000..bb9992e3a
--- /dev/null
+++ b/apps/backend/internal/voice/handlers/transcribe_handlers.go
@@ -0,0 +1,117 @@
+// Package handlers exposes the HTTP surface for the voice-input transcription
+// fallback. The endpoint is unauthenticated (matches /api/v1/features) — the
+// Web Speech API path is preferred by the frontend, so this server-side
+// fallback only runs when the browser cannot do it locally.
+package handlers
+
+import (
+	"errors"
+	"io"
+	"net/http"
+
+	"github.com/gin-gonic/gin"
+	"go.uber.org/zap"
+
+	"github.com/kandev/kandev/internal/common/logger"
+	"github.com/kandev/kandev/internal/voice/transcribe"
+)
+
+// maxAudioBytes caps the multipart request body. Whisper accepts up to 25 MB
+// per request; we cap lower so a runaway recording (e.g. a stuck mic) neither
+// exhausts backend memory nor burns API spend.
+const maxAudioBytes = 10 * 1024 * 1024
+
+// Handlers wires the transcribe service into Gin routes.
+type Handlers struct {
+	svc *transcribe.Service // transcription backend; nil or unconfigured yields 503
+	log *logger.Logger      // logger tagged with component=voice-handlers
+}
+
+// NewHandlers builds a Handlers around svc. The logger is tagged with a
+// component=voice-handlers field; svc may be nil or unconfigured, in which
+// case httpTranscribe answers 503.
+func NewHandlers(svc *transcribe.Service, log *logger.Logger) *Handlers {
+	scoped := log.WithFields(zap.String("component", "voice-handlers"))
+	return &Handlers{svc: svc, log: scoped}
+}
+
+// RegisterRoutes mounts POST /api/v1/transcribe on router, served by a
+// Handlers built from svc and log.
+func RegisterRoutes(router *gin.Engine, svc *transcribe.Service, log *logger.Logger) {
+	handlers := NewHandlers(svc, log)
+	router.Group("/api/v1").POST("/transcribe", handlers.httpTranscribe)
+}
+
+// httpTranscribe handles POST /api/v1/transcribe. It reads the multipart
+// "audio" part into memory, hands it to the transcribe service and replies
+// with {"text": ...}. It answers 503 when no usable service is configured.
+func (h *Handlers) httpTranscribe(c *gin.Context) {
+	if h.svc == nil || !h.svc.Configured() {
+		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "voice transcription is not configured on this server"})
+		return
+	}
+
+	// Cap the body before parsing. When the cap is exceeded, Gin's multipart
+	// parser reads through the wrapped reader and c.FormFile fails with
+	// *http.MaxBytesError before any *FileHeader exists; report that as 413
+	// so it is not mistaken for a genuinely missing field (400).
+	c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, maxAudioBytes)
+
+	header, err := c.FormFile("audio")
+	if err != nil {
+		status, msg := http.StatusBadRequest, "audio file is required (multipart field 'audio')"
+		var tooLarge *http.MaxBytesError
+		if errors.As(err, &tooLarge) {
+			status, msg = http.StatusRequestEntityTooLarge, "audio payload too large"
+		}
+		c.JSON(status, gin.H{"error": msg})
+		return
+	}
+
+	src, err := header.Open()
+	if err != nil {
+		h.log.Warn("open uploaded audio failed", zap.Error(err))
+		c.JSON(http.StatusBadRequest, gin.H{"error": "cannot open uploaded audio"})
+		return
+	}
+	defer func() { _ = src.Close() }()
+
+	audio, err := io.ReadAll(src)
+	if err != nil {
+		h.log.Warn("read uploaded audio failed", zap.Error(err))
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read uploaded audio"})
+		return
+	}
+	if len(audio) == 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "audio file is empty"})
+		return
+	}
+
+	contentType := header.Header.Get("Content-Type")
+	text, err := h.svc.Transcribe(c.Request.Context(), audio, contentType, header.Filename)
+	if err != nil {
+		h.respondError(c, err)
+		return
+	}
+	c.JSON(http.StatusOK, gin.H{"text": text})
+}
+
+// respondError translates a transcribe error into an HTTP response:
+// ErrNotConfigured -> 503, *UpstreamError -> 502 (logged), anything else -> 500.
+func (h *Handlers) respondError(c *gin.Context, err error) {
+	var upstream *transcribe.UpstreamError
+	switch {
+	case errors.Is(err, transcribe.ErrNotConfigured):
+		msg := "voice transcription is not configured on this server"
+		c.JSON(http.StatusServiceUnavailable, gin.H{"error": msg})
+	case errors.As(err, &upstream):
+		h.log.Warn("whisper upstream error",
+			zap.Int("status", upstream.StatusCode),
+			zap.String("body", upstream.Body),
+		)
+		c.JSON(http.StatusBadGateway, gin.H{"error": "upstream transcription error"})
+	default:
+		h.log.Error("transcription failed", zap.Error(err))
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "transcription failed"})
+	}
+}
diff --git a/apps/backend/internal/voice/handlers/transcribe_handlers_test.go b/apps/backend/internal/voice/handlers/transcribe_handlers_test.go
new file mode 100644
index 000000000..aa3170578
--- /dev/null
+++ b/apps/backend/internal/voice/handlers/transcribe_handlers_test.go
@@ -0,0 +1,157 @@
+package handlers
+
+import (
+	"bytes"
+	"encoding/json"
+	"mime/multipart"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+
+	"github.com/gin-gonic/gin"
+
+	"github.com/kandev/kandev/internal/common/logger"
+	"github.com/kandev/kandev/internal/voice/transcribe"
+)
+
+func init() {
+	gin.SetMode(gin.TestMode)
+}
+
+func testLogger(t *testing.T) *logger.Logger {
+	t.Helper()
+	log, err := logger.NewLogger(logger.LoggingConfig{Level: "error", Format: "text", OutputPath: "stderr"})
+	if err != nil {
+		t.Fatalf("logger.NewLogger: %v", err)
+	}
+	return log
+}
+
+func buildAudioRequest(t *testing.T, field, filename, mime string, data []byte) (*http.Request, string) {
+	t.Helper()
+	buf := &bytes.Buffer{}
+	w := multipart.NewWriter(buf)
+	if data != nil {
+		fw, err := createFormFile(w, field, filename, mime)
+		if err != nil {
+			t.Fatal(err)
+		}
+		_, _ = fw.Write(data)
+	}
+	_ = w.Close()
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/transcribe", buf)
+	req.Header.Set("Content-Type", w.FormDataContentType())
+	return req, w.FormDataContentType()
+}
+
+// createFormFile adds a file part named field to w. With an empty mime it
+// defers to CreateFormFile; otherwise it sets the part's Content-Type itself.
+func createFormFile(w *multipart.Writer, field, filename, mime string) (interface{ Write([]byte) (int, error) }, error) {
+	if mime == "" {
+		return w.CreateFormFile(field, filename)
+	}
+	disposition := "form-data; name=\"" + field + "\"; filename=\"" + filename + "\""
+	return w.CreatePart(map[string][]string{"Content-Disposition": {disposition}, "Content-Type": {mime}})
+}
+
+func newRouter(svc *transcribe.Service, t *testing.T) *gin.Engine {
+	r := gin.New()
+	RegisterRoutes(r, svc, testLogger(t))
+	return r
+}
+
+// TestTranscribe_NotConfigured expects 503 when the service has no API key.
+func TestTranscribe_NotConfigured(t *testing.T) {
+	router := newRouter(transcribe.New(""), t)
+
+	req, _ := buildAudioRequest(t, "audio", "a.webm", "audio/webm", []byte("hello"))
+	rec := httptest.NewRecorder()
+	router.ServeHTTP(rec, req)
+
+	if got := rec.Code; got != http.StatusServiceUnavailable {
+		t.Fatalf("status = %d, want 503; body=%s", got, rec.Body.String())
+	}
+}
+
+// TestTranscribe_MissingFile expects 400 when the form has no "audio" part.
+func TestTranscribe_MissingFile(t *testing.T) {
+	router := newRouter(transcribe.New("sk-test"), t)
+
+	// No file part — just an empty form.
+	body := &bytes.Buffer{}
+	form := multipart.NewWriter(body)
+	_ = form.Close()
+	req := httptest.NewRequest(http.MethodPost, "/api/v1/transcribe", body)
+	req.Header.Set("Content-Type", form.FormDataContentType())
+
+	rec := httptest.NewRecorder()
+	router.ServeHTTP(rec, req)
+	if got := rec.Code; got != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400; body=%s", got, rec.Body.String())
+	}
+}
+
+// TestTranscribe_EmptyAudio expects 400 when the "audio" part has zero bytes.
+func TestTranscribe_EmptyAudio(t *testing.T) {
+	router := newRouter(transcribe.New("sk-test"), t)
+
+	req, _ := buildAudioRequest(t, "audio", "a.webm", "audio/webm", []byte{})
+	rec := httptest.NewRecorder()
+	router.ServeHTTP(rec, req)
+	if got := rec.Code; got != http.StatusBadRequest {
+		t.Fatalf("status = %d, want 400; body=%s", got, rec.Body.String())
+	}
+}
+
+// TestTranscribe_Success checks the 200 path against a stub upstream server.
+func TestTranscribe_Success(t *testing.T) {
+	stub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if got := r.Header.Get("Authorization"); got != "Bearer sk-test" {
+			t.Errorf("auth header missing")
+		}
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"text":"transcribed"}`))
+	}))
+	defer stub.Close()
+
+	router := newRouter(transcribe.New("sk-test", transcribe.WithEndpoint(stub.URL)), t)
+
+	req, _ := buildAudioRequest(t, "audio", "clip.webm", "audio/webm", []byte("bytes"))
+	rec := httptest.NewRecorder()
+	router.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusOK {
+		t.Fatalf("status = %d, want 200; body=%s", rec.Code, rec.Body.String())
+	}
+	var payload struct {
+		Text string `json:"text"`
+	}
+	if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil {
+		t.Fatal(err)
+	}
+	if payload.Text != "transcribed" {
+		t.Errorf("text = %q", payload.Text)
+	}
+}
+
+// TestTranscribe_UpstreamError expects 502 when the stub upstream replies 502.
+func TestTranscribe_UpstreamError(t *testing.T) {
+	stub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusBadGateway)
+		_, _ = w.Write([]byte(`{"error":"oops"}`))
+	}))
+	defer stub.Close()
+
+	router := newRouter(transcribe.New("sk-test", transcribe.WithEndpoint(stub.URL)), t)
+	req, _ := buildAudioRequest(t, "audio", "a.webm", "audio/webm", []byte("bytes"))
+	rec := httptest.NewRecorder()
+	router.ServeHTTP(rec, req)
+
+	if rec.Code != http.StatusBadGateway {
+		t.Fatalf("status = %d, want 502; body=%s", rec.Code, rec.Body.String())
+	}
+	if got := rec.Body.String(); !strings.Contains(got, "upstream") {
+		t.Errorf("body should mention upstream: %s", got)
+	}
+}
diff --git a/apps/backend/internal/voice/transcribe/service.go b/apps/backend/internal/voice/transcribe/service.go
new file mode 100644
index 000000000..fad7b13b9
--- /dev/null
+++ b/apps/backend/internal/voice/transcribe/service.go
@@ -0,0 +1,185 @@
+// Package transcribe wraps the OpenAI Whisper transcription endpoint for the
+// chat voice-input fallback. The browser's Web Speech API is the primary
+// voice-input engine; this server-side path is only hit when the browser
+// has no SpeechRecognition support.
+package transcribe
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"net/textproto"
+	"strings"
+	"time"
+)
+
+// ErrNotConfigured is the sentinel Transcribe returns when the Service has no
+// API key. Match it with errors.Is; the handler maps it to HTTP 503 so the
+// frontend can hide the Whisper fallback path instead of retrying forever.
+var ErrNotConfigured = errors.New("voice transcription is not configured")
+
+// UpstreamError records a non-2xx OpenAI reply so the handler can answer 502.
+type UpstreamError struct {
+	StatusCode int
+	Body       string
+}
+
+// Error renders the upstream status code and raw body as a single line.
+func (ue *UpstreamError) Error() string {
+	return fmt.Sprintf("openai whisper upstream error: status=%d body=%s", ue.StatusCode, ue.Body)
+}
+
+const (
+	defaultTimeout  = time.Minute
+	defaultModel    = "whisper-1"
+	defaultEndpoint = "https://api.openai.com/v1/audio/transcriptions"
+)
+
+// Service transcribes audio via OpenAI's Whisper endpoint.
+type Service struct {
+	apiKey   string       // bearer token; blank means "not configured"
+	endpoint string       // URL that Transcribe posts to
+	model    string       // value sent in the multipart "model" field
+	client   *http.Client // client used for the upstream call
+}
+
+// Option adjusts a Service while New builds it (custom endpoint, HTTP client).
+type Option func(*Service)
+
+// WithEndpoint replaces the upstream URL, e.g. with an httptest server in tests.
+func WithEndpoint(url string) Option {
+	return func(svc *Service) { svc.endpoint = url }
+}
+
+// WithHTTPClient replaces the HTTP client used for the upstream call.
+func WithHTTPClient(c *http.Client) Option {
+	return func(svc *Service) { svc.client = c }
+}
+
+// WithModel replaces the Whisper model name sent upstream.
+func WithModel(model string) Option {
+	return func(svc *Service) { svc.model = model }
+}
+
+// New builds a Service from the package defaults and then applies opts in
+// order. apiKey may be empty: Transcribe then returns ErrNotConfigured.
+func New(apiKey string, opts ...Option) *Service {
+	svc := new(Service)
+	svc.apiKey = apiKey
+	svc.endpoint = defaultEndpoint
+	svc.model = defaultModel
+	svc.client = &http.Client{Timeout: defaultTimeout}
+	// Options run last so they can override any of the defaults above.
+	for i := range opts {
+		opts[i](svc)
+	}
+	return svc
+}
+
+// Configured reports whether an API key is present; a nil receiver or a
+// whitespace-only key counts as unconfigured. Handlers use it to bail out early.
+func (s *Service) Configured() bool {
+	return !(s == nil || strings.TrimSpace(s.apiKey) == "")
+}
+
+// Transcribe posts audio to the Whisper endpoint and returns the trimmed
+// transcript; filename's extension tells Whisper the audio format. It returns
+// ErrNotConfigured without an API key and *UpstreamError on a non-2xx reply.
+func (s *Service) Transcribe(ctx context.Context, audio []byte, mimeType, filename string) (string, error) {
+	if !s.Configured() {
+		return "", ErrNotConfigured
+	}
+	if len(audio) == 0 {
+		return "", errors.New("audio payload is empty")
+	}
+	body, contentType, err := buildMultipart(audio, mimeType, filename, s.model)
+	if err != nil {
+		return "", fmt.Errorf("build multipart body: %w", err)
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, s.endpoint, body)
+	if err != nil {
+		return "", fmt.Errorf("build whisper request: %w", err)
+	}
+	req.Header.Set("Authorization", "Bearer "+s.apiKey)
+	req.Header.Set("Content-Type", contentType)
+	req.Header.Set("Accept", "application/json")
+
+	resp, err := s.client.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("call whisper endpoint: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+	rawBody, readErr := io.ReadAll(resp.Body)
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		// The status is known even if the error body was only partially read.
+		return "", &UpstreamError{StatusCode: resp.StatusCode, Body: string(rawBody)}
+	}
+	if readErr != nil {
+		return "", fmt.Errorf("read whisper response: %w", readErr)
+	}
+	var parsed struct {
+		Text string `json:"text"`
+	}
+	if err := json.Unmarshal(rawBody, &parsed); err != nil {
+		return "", fmt.Errorf("decode whisper response: %w", err)
+	}
+	return strings.TrimSpace(parsed.Text), nil
+}
+
+// buildMultipart assembles the multipart/form-data body Whisper expects:
+// `file`, `model`, and `response_format=json`.
+func buildMultipart(audio []byte, mimeType, filename, model string) (io.Reader, string, error) {
+	buf := &bytes.Buffer{}
+	w := multipart.NewWriter(buf)
+
+	if filename == "" {
+		filename = "recording" + extensionForMime(mimeType)
+	}
+	header := textproto.MIMEHeader{}
+	header.Set("Content-Disposition", fmt.Sprintf(`form-data; name="file"; filename=%q`, filename))
+	if mimeType != "" {
+		header.Set("Content-Type", mimeType)
+	}
+	filePart, err := w.CreatePart(header)
+	if err != nil {
+		return nil, "", err
+	}
+	if _, err := filePart.Write(audio); err != nil {
+		return nil, "", err
+	}
+
+	if err := w.WriteField("model", model); err != nil {
+		return nil, "", err
+	}
+	if err := w.WriteField("response_format", "json"); err != nil {
+		return nil, "", err
+	}
+	if err := w.Close(); err != nil {
+		return nil, "", err
+	}
+	return buf, w.FormDataContentType(), nil
+}
+
+// extensionForMime maps a MediaRecorder MIME type to a Whisper-friendly file
+// extension by case-insensitive substring match, defaulting to ".webm".
+func extensionForMime(mime string) string {
+	lowered := strings.ToLower(mime)
+	for _, rule := range []struct{ needle, ext string }{
+		{"wav", ".wav"},
+		{"mp4", ".m4a"},
+		{"m4a", ".m4a"},
+		{"mpeg", ".mp3"},
+		{"mp3", ".mp3"},
+		{"ogg", ".ogg"},
+	} {
+		if strings.Contains(lowered, rule.needle) {
+			return rule.ext
+		}
+	}
+	return ".webm"
+}
diff --git a/apps/backend/internal/voice/transcribe/service_test.go b/apps/backend/internal/voice/transcribe/service_test.go
new file mode 100644
index 000000000..b8c8315cf
--- /dev/null
+++ b/apps/backend/internal/voice/transcribe/service_test.go
@@ -0,0 +1,222 @@
+package transcribe
+
+import (
+	"context"
+	"errors"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+func TestService_Transcribe_NotConfigured(t *testing.T) {
+	_, err := New("").Transcribe(context.Background(), []byte("data"), "audio/webm", "")
+	if errors.Is(err, ErrNotConfigured) {
+		return
+	}
+	t.Fatalf("expected ErrNotConfigured, got %v", err)
+}
+
+// TestService_Configured checks that only a non-blank key counts as configured.
+func TestService_Configured(t *testing.T) {
+	expect := func(key string, want bool, complaint string) {
+		if New(key).Configured() != want {
+			t.Error(complaint)
+		}
+	}
+	expect("", false, "empty key should not be configured")
+	expect("   ", false, "whitespace-only key should not be configured")
+	expect("sk-test", true, "non-empty key should be configured")
+}
+
+func TestService_Transcribe_EmptyAudio(t *testing.T) {
+	_, err := New("sk-test").Transcribe(context.Background(), nil, "audio/webm", "")
+	if err != nil {
+		return
+	}
+	t.Fatal("expected error for empty audio")
+}
+
+// TestService_Transcribe_Success checks the request Transcribe sends upstream
+// (auth header, model, response_format, file part) and the text it returns.
+func TestService_Transcribe_Success(t *testing.T) {
+	var capturedAuth, capturedFilename, capturedModel, capturedFormat string
+	var capturedFileBytes []byte
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		capturedAuth = r.Header.Get("Authorization")
+		// Report with Errorf + return: FailNow (and therefore Fatalf) must be
+		// called from the goroutine running the test, not from this handler.
+		if err := r.ParseMultipartForm(32 << 20); err != nil {
+			t.Errorf("parse multipart: %v", err)
+			// r.MultipartForm may be nil after a failed parse; stop before using it.
+			return
+		}
+		capturedModel = r.FormValue("model")
+		capturedFormat = r.FormValue("response_format")
+		fh := r.MultipartForm.File["file"]
+		if len(fh) != 1 {
+			t.Errorf("expected 1 file part, got %d", len(fh))
+			return
+		}
+		capturedFilename = fh[0].Filename
+		f, err := fh[0].Open()
+		if err != nil {
+			t.Errorf("open file: %v", err)
+			return
+		}
+		defer func() { _ = f.Close() }()
+		capturedFileBytes, _ = io.ReadAll(f)
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(`{"text":"hello world"}`))
+	}))
+	defer srv.Close()
+
+	svc := New("sk-test", WithEndpoint(srv.URL))
+	text, err := svc.Transcribe(context.Background(), []byte("audio-bytes"), "audio/webm", "clip.webm")
+	if err != nil {
+		t.Fatalf("Transcribe failed: %v", err)
+	}
+	if text != "hello world" {
+		t.Errorf("unexpected text: %q", text)
+	}
+	if capturedAuth != "Bearer sk-test" {
+		t.Errorf("auth header = %q", capturedAuth)
+	}
+	if capturedModel != defaultModel {
+		t.Errorf("model = %q", capturedModel)
+	}
+	if capturedFormat != "json" {
+		t.Errorf("response_format = %q", capturedFormat)
+	}
+	if capturedFilename != "clip.webm" {
+		t.Errorf("filename = %q", capturedFilename)
+	}
+	if string(capturedFileBytes) != "audio-bytes" {
+		t.Errorf("file body = %q", string(capturedFileBytes))
+	}
+}
+
+func TestService_Transcribe_DerivedFilename(t *testing.T) {
+	var capturedFilename string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if err := r.ParseMultipartForm(32 << 20); err != nil {
+			t.Errorf("parse multipart: %v", err)
+			return // r.MultipartForm may be nil; dereferencing it would panic
+		}
+		if fh := r.MultipartForm.File["file"]; len(fh) == 1 {
+			capturedFilename = fh[0].Filename
+		}
+		_, _ = w.Write([]byte(`{"text":""}`))
+	}))
+	defer srv.Close()
+	svc := New("sk-test", WithEndpoint(srv.URL))
+	// An empty filename makes the service derive one from the MIME type.
+	if _, err := svc.Transcribe(context.Background(), []byte("a"), "audio/wav", ""); err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasSuffix(capturedFilename, ".wav") {
+		t.Errorf("derived filename should use .wav for audio/wav, got %q", capturedFilename)
+	}
+}
+
+// TestService_Transcribe_UpstreamError: a non-2xx reply surfaces as *UpstreamError.
+func TestService_Transcribe_UpstreamError(t *testing.T) {
+	reject := func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusBadRequest)
+		_, _ = w.Write([]byte(`{"error":"bad audio"}`))
+	}
+	srv := httptest.NewServer(http.HandlerFunc(reject))
+	defer srv.Close()
+	_, err := New("sk-test", WithEndpoint(srv.URL)).Transcribe(context.Background(), []byte("a"), "audio/webm", "")
+	var upErr *UpstreamError
+	if !errors.As(err, &upErr) {
+		t.Fatalf("expected UpstreamError, got %T: %v", err, err)
+	}
+	if upErr.StatusCode != http.StatusBadRequest {
+		t.Errorf("status = %d", upErr.StatusCode)
+	}
+	if !strings.Contains(upErr.Body, "bad audio") {
+		t.Errorf("body did not contain upstream payload: %q", upErr.Body)
+	}
+}
+
+func TestExtensionForMime(t *testing.T) {
+	table := []struct{ mime, want string }{
+		{"audio/webm", ".webm"},
+		{"audio/wav", ".wav"},
+		{"audio/x-wav", ".wav"},
+		{"audio/mp4", ".m4a"},
+		{"audio/m4a", ".m4a"},
+		{"audio/mpeg", ".mp3"},
+		{"audio/mp3", ".mp3"},
+		{"audio/ogg", ".ogg"},
+		{"", ".webm"},
+		{"application/anything", ".webm"},
+	}
+	for _, row := range table {
+		if got := extensionForMime(row.mime); got != row.want {
+			t.Errorf("extensionForMime(%q) = %q, want %q", row.mime, got, row.want)
+		}
+	}
+}
+
+// TestBuildMultipart_Roundtrip re-parses buildMultipart's output and checks
+// that the file bytes and both text fields survive.
+func TestBuildMultipart_Roundtrip(t *testing.T) {
+	encoded, contentType, err := buildMultipart([]byte("hello"), "audio/wav", "a.wav", "whisper-1")
+	if err != nil {
+		t.Fatal(err)
+	}
+	// The boundary needed to re-parse the body travels in the content type.
+	mediaType, params, ok := splitContentType(contentType)
+	if !(ok && mediaType == "multipart/form-data") {
+		t.Fatalf("unexpected content-type: %q", contentType)
+	}
+
+	var fileContent string
+	fields := make(map[string]string)
+	reader := multipart.NewReader(encoded, params["boundary"])
+	for part, err := reader.NextPart(); err != io.EOF; part, err = reader.NextPart() {
+		if err != nil {
+			t.Fatal(err)
+		}
+		data, _ := io.ReadAll(part)
+		if part.FileName() == "" {
+			fields[part.FormName()] = string(data)
+			continue
+		}
+		fileContent = string(data)
+	}
+
+	if fileContent != "hello" {
+		t.Errorf("file part = %q", fileContent)
+	}
+	if fields["model"] != "whisper-1" {
+		t.Errorf("model field = %q", fields["model"])
+	}
+	if fields["response_format"] != "json" {
+		t.Errorf("response_format field = %q", fields["response_format"])
+	}
+}
+
+// splitContentType pulls the media type and its parameters out of a value
+// such as "multipart/form-data; boundary=…". It reports false when there is
+// no ';'. It avoids mime.ParseMediaType to keep this test file self-contained.
+func splitContentType(ct string) (string, map[string]string, bool) {
+	head, rest, found := strings.Cut(ct, ";")
+	if !found {
+		return "", nil, false
+	}
+	mediaType := strings.TrimSpace(head)
+	params := make(map[string]string)
+	for _, pair := range strings.Split(rest, ";") {
+		// Segments without '=' carry no parameter and are skipped.
+		key, value, hasEq := strings.Cut(strings.TrimSpace(pair), "=")
+		if hasEq {
+			params[key] = strings.Trim(value, `"`)
+		}
+	}
+	return mediaType, params, true
+}
diff --git a/apps/pnpm-lock.yaml b/apps/pnpm-lock.yaml
index e773c9190..8120436bf 100644
--- a/apps/pnpm-lock.yaml
+++ b/apps/pnpm-lock.yaml
@@ -246,6 +246,9 @@ importers:
       '@dnd-kit/utilities':
         specifier: ^3.2.2
         version: 3.2.2(react@19.2.3)
+      '@huggingface/transformers':
+        specifier: ^4.2.0
+        version: 4.2.0
       '@kandev/theme':
         specifier: workspace:*
         version: link:../packages/theme
@@ -419,7 +422,7 @@ importers:
         version: 0.55.1
       next:
         specifier: 16.1.7
-        version: 16.1.7(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3)
+        version: 16.1.7(@babel/core@7.28.6)(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3)
       next-themes:
         specifier: ^0.4.6
         version: 0.4.6(react-dom@19.2.3(react@19.2.3))(react@19.2.3)
@@ -473,7 +476,7 @@ importers:
         version: 2.0.7(react-dom@19.2.3(react@19.2.3))(react@19.2.3)
       styled-jsx:
         specifier: 5.1.6
-        version: 5.1.6(react@19.2.3)
+        version: 5.1.6(@babel/core@7.28.6)(react@19.2.3)
       tailwind-merge:
         specifier: ^3.4.0
         version: 3.4.0
@@ -1406,6 +1409,16 @@ packages:
     peerDependencies:
       hono: ^4
 
+  '@huggingface/jinja@0.5.9':
+    resolution: {integrity: sha512-uWTG+l3VJRsl7EXxYizuL3P+cCPoc3cRqbWWRcQN0FhejRfbdq0RNhCmbY/YDtnTcz9icdLYuLDjsnz4d8JMuw==}
+    engines: {node: '>=18'}
+
+  '@huggingface/tokenizers@0.1.3':
+    resolution: {integrity: sha512-8rF/RRT10u+kn7YuUbUg0OF30K8rjTc78aHpxT+qJ1uWSqxT1MHi8+9ltwYfkFYJzT/oS+qw3JVfHtNMGAdqyA==}
+
+  '@huggingface/transformers@4.2.0':
+    resolution: {integrity: sha512-8BRCoBMH0XsWaEIamuR0LrJGAfftgHAfb2Vrffy0VKlSAE/MnUJ5/h/zTfEP3fDIft+nk7TqB8xXEyABGitBjQ==}
+
   '@humanfs/core@0.19.1':
     resolution: {integrity: sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==}
     engines: {node: '>=18.18.0'}
@@ -1845,6 +1858,36 @@ packages:
     engines: {node: '>=18'}
     hasBin: true
 
+  '@protobufjs/aspromise@1.1.2':
+    resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==}
+
+  '@protobufjs/base64@1.1.2':
+    resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==}
+
+  '@protobufjs/codegen@2.0.5':
+    resolution: {integrity: sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==}
+
+  '@protobufjs/eventemitter@1.1.1':
+    resolution: {integrity: sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==}
+
+  '@protobufjs/fetch@1.1.1':
+    resolution: {integrity: sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==}
+
+  '@protobufjs/float@1.0.2':
+    resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==}
+
+  '@protobufjs/inquire@1.1.2':
+    resolution: {integrity: sha512-pa0vFRuws4wkvaXKK1uXZMAwAX4/t8ANaJo45iw/oQHNQ9q5xUzwgFmVJGXiga2BeN+zpX7Vf9vmsiIa2J+MUw==}
+
+  '@protobufjs/path@1.1.2':
+    resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==}
+
+  '@protobufjs/pool@1.1.0':
+    resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==}
+
+  '@protobufjs/utf8@1.1.1':
+    resolution: {integrity: sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==}
+
   '@radix-ui/number@1.1.1':
     resolution: {integrity: sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==}
 
@@ -3423,6 +3466,7 @@ packages:
 
   '@ungap/structured-clone@1.3.0':
     resolution: {integrity: sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==}
+    deprecated: Potential CWE-502 - Update to 1.3.1 or higher
 
   '@unrs/resolver-binding-android-arm-eabi@1.11.1':
     resolution: {integrity: sha512-ppLRUgHVaGRWUx0R0Ut06Mjo9gBaBkg3v/8AxusGLhsIotbBLuRk51rAzqLC8gq6NyyAojEXglNjzf6R948DNw==}
@@ -3575,6 +3619,10 @@ packages:
     engines: {node: '>=0.4.0'}
     hasBin: true
 
+  adm-zip@0.5.17:
+    resolution: {integrity: sha512-+Ut8d9LLqwEvHHJl1+PIHqoyDxFgVN847JTVM3Izi3xHDWPE4UtzzXysMZQs64DMcrJfBeS/uoEP4AD3HQHnQQ==}
+    engines: {node: '>=12.0'}
+
   agent-base@7.1.4:
     resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==}
     engines: {node: '>= 14'}
@@ -3707,6 +3755,10 @@ packages:
     resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==}
     engines: {node: '>=18'}
 
+  boolean@3.2.0:
+    resolution: {integrity: sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw==}
+    deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info.
+
   brace-expansion@1.1.12:
     resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==}
 
@@ -4226,6 +4278,9 @@ packages:
   detect-node-es@1.1.0:
     resolution: {integrity: sha512-ypdmJU/TbBby2Dxibuv7ZLW3Bs1QEmM7nHjEANfohJLvE0XVujisn1qPJcZxg+qDucsr+bP6fLD1rPS3AhJ7EQ==}
 
+  detect-node@2.1.0:
+    resolution: {integrity: sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==}
+
   devlop@1.1.0:
     resolution: {integrity: sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==}
 
@@ -4367,6 +4422,9 @@ packages:
     resolution: {integrity: sha512-w+5mJ3GuFL+NjVtJlvydShqE1eN3h3PbI7/5LAsYJP/2qtuMXjfL2LpHSRqo4b4eSF5K/DH1JXKUAHSB2UW50g==}
     engines: {node: '>= 0.4'}
 
+  es6-error@4.1.1:
+    resolution: {integrity: sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg==}
+
   esbuild@0.21.5:
     resolution: {integrity: sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==}
     engines: {node: '>=12'}
@@ -4642,6 +4700,9 @@ packages:
     resolution: {integrity: sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==}
     engines: {node: '>=16'}
 
+  flatbuffers@25.9.23:
+    resolution: {integrity: sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ==}
+
   flatted@3.3.3:
     resolution: {integrity: sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==}
 
@@ -4765,6 +4826,10 @@ packages:
     resolution: {integrity: sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==}
     engines: {node: '>=10.13.0'}
 
+  global-agent@3.0.0:
+    resolution: {integrity: sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q==}
+    engines: {node: '>=10.0'}
+
   global-directory@4.0.1:
     resolution: {integrity: sha512-wHTUcDUoZ1H5/0iVqEudYW4/kAlN5cZ3j/bXn0Dpbizl9iaUVeWSHqiOjsgk6OW2bkLclbBjzewBz6weQ1zA2Q==}
     engines: {node: '>=18'}
@@ -4792,6 +4857,9 @@ packages:
     resolution: {integrity: sha512-DKKrynuQRne0PNpEbzuEdHlYOMksHSUI8Zc9Unei5gTsMNA2/vMpoMz/yKba50pejK56qj98qM0SjYxAKi13gQ==}
     engines: {node: ^12.22.0 || ^14.16.0 || ^16.0.0 || >=17.0.0}
 
+  guid-typescript@1.0.9:
+    resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==}
+
   hachure-fill@0.5.2:
     resolution: {integrity: sha512-3GKBOn+m2LX9iq+JC1064cSFprJY4jL1jCXTcpnfER5HYE2l/4EfWSGzkPa/ZDBmYI0ZOEj5VHV/eKnPGkHuOg==}
 
@@ -5200,6 +5268,9 @@ packages:
   json-stable-stringify-without-jsonify@1.0.1:
     resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==}
 
+  json-stringify-safe@5.0.1:
+    resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==}
+
   json5@1.0.2:
     resolution: {integrity: sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==}
     hasBin: true
@@ -5385,6 +5456,9 @@ packages:
     resolution: {integrity: sha512-i24m8rpwhmPIS4zscNzK6MSEhk0DUWa/8iYQWxhffV8jkI4Phvs3F+quL5xvS0gdQR0FyTCMMH33Y78dDTzzIw==}
     engines: {node: '>=18'}
 
+  long@5.3.2:
+    resolution: {integrity: sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==}
+
   longest-streak@3.1.0:
     resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==}
 
@@ -5431,6 +5505,10 @@ packages:
     engines: {node: '>= 20'}
     hasBin: true
 
+  matcher@3.0.0:
+    resolution: {integrity: sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng==}
+    engines: {node: '>=10'}
+
   math-intrinsics@1.1.0:
     resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==}
     engines: {node: '>= 0.4'}
@@ -5798,6 +5876,19 @@ packages:
   oniguruma-to-es@4.3.4:
     resolution: {integrity: sha512-3VhUGN3w2eYxnTzHn+ikMI+fp/96KoRSVK9/kMTcFqj1NRDh2IhQCKvYxDnWePKRXY/AqH+Fuiyb7VHSzBjHfA==}
 
+  onnxruntime-common@1.24.0-dev.20251116-b39e144322:
+    resolution: {integrity: sha512-BOoomdHYmNRL5r4iQ4bMvsl2t0/hzVQ3OM3PHD0gxeXu1PmggqBv3puZicEUVOA3AtHHYmqZtjMj9FOfGrATTw==}
+
+  onnxruntime-common@1.24.3:
+    resolution: {integrity: sha512-GeuPZO6U/LBJXvwdaqHbuUmoXiEdeCjWi/EG7Y1HNnDwJYuk6WUbNXpF6luSUY8yASul3cmUlLGrCCL1ZgVXqA==}
+
+  onnxruntime-node@1.24.3:
+    resolution: {integrity: sha512-JH7+czbc8ALA819vlTgcV+Q214/+VjGeBHDjX81+ZCD0PCVCIFGFNtT0V4sXG/1JXypKPgScQcB3ij/hk3YnTg==}
+    os: [win32, darwin, linux]
+
+  onnxruntime-web@1.26.0-dev.20260416-b7804b056c:
+    resolution: {integrity: sha512-MD6Ss4GSpQBo6zqoJzyT9LRbKYs7x/JVN23FT24EcEvlqF4VuzPOeH6X38orZPKHQDbprn7K+SBpu0/mj2CQiw==}
+
   open@11.0.0:
     resolution: {integrity: sha512-smsWv2LzFjP03xmvFoJ331ss6h+jixfA4UUV/Bsiyuu4YJPfN+FIQGOIiv4w9/+MoHkfkJ22UIaQWRVFRfH6Vw==}
     engines: {node: '>=20'}
@@ -5911,6 +6002,9 @@ packages:
   pkg-types@1.3.1:
     resolution: {integrity: sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ==}
 
+  platform@1.3.6:
+    resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==}
+
   playwright-core@1.58.2:
     resolution: {integrity: sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==}
     engines: {node: '>=18'}
@@ -6036,6 +6130,10 @@ packages:
   prosemirror-view@1.41.5:
     resolution: {integrity: sha512-UDQbIPnDrjE8tqUBbPmCOZgtd75htE6W3r0JCmY9bL6W1iemDM37MZEKC49d+tdQ0v/CKx4gjxLoLsfkD2NiZA==}
 
+  protobufjs@7.6.1:
+    resolution: {integrity: sha512-4K0myLaWL5EteuSAro91EGFgcfVgxb64Jx+7oDAY6GOkXD4M69yuSEljNcInGVCA5sOPxmZ/EqDLj2x0Q0+Ygg==}
+    engines: {node: '>=12.0.0'}
+
   proxy-addr@2.0.7:
     resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==}
     engines: {node: '>= 0.10'}
@@ -6170,6 +6268,7 @@ packages:
   recharts@2.15.4:
     resolution: {integrity: sha512-UT/q6fwS3c1dHbXv2uFgYJ9BMFHu3fwnd7AYZaEQhXuYQ4hgsxLvsUXzGdKeZrW5xopzDCvuA2N41WJ88I7zIw==}
     engines: {node: '>=14'}
+    deprecated: 1.x and 2.x branches are no longer active. Bump to Recharts v3 to receive latest features and bugfixes. See https://github.com/recharts/recharts/wiki/3.0-migration-guide
     peerDependencies:
       react: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
       react-dom: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0
@@ -6265,6 +6364,10 @@ packages:
     resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==}
     engines: {iojs: '>=1.0.0', node: '>=0.10.0'}
 
+  roarr@2.15.4:
+    resolution: {integrity: sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A==}
+    engines: {node: '>=8.0'}
+
   robust-predicates@3.0.2:
     resolution: {integrity: sha512-IXgzBWvWQwE6PrDI05OvmXUIruQTcoMDzRsOd5CDvHCVLcLHMTSYvOK5Cm46kWqlV3yAbuSpBZdJ5oP5OUoStg==}
 
@@ -6315,6 +6418,9 @@ packages:
     resolution: {integrity: sha512-3A6sD0WYP7+QrjbfNA2FN3FsOaGGFoekCVgTyypy53gPxhbkCIjtO6YWgdrfM+n/8sI8JeXZOIxsHjMTNxQ4nQ==}
     engines: {node: ^14.0.0 || >=16.0.0}
 
+  semver-compare@1.0.0:
+    resolution: {integrity: sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow==}
+
   semver@6.3.1:
     resolution: {integrity: sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==}
     hasBin: true
@@ -6328,6 +6434,10 @@ packages:
     resolution: {integrity: sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==}
     engines: {node: '>= 18'}
 
+  serialize-error@7.0.1:
+    resolution: {integrity: sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw==}
+    engines: {node: '>=10'}
+
   serve-static@2.2.1:
     resolution: {integrity: sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==}
     engines: {node: '>= 18'}
@@ -6416,6 +6526,9 @@ packages:
     resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==}
     engines: {node: '>= 10.x'}
 
+  sprintf-js@1.1.3:
+    resolution: {integrity: sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==}
+
   stable-hash@0.0.5:
     resolution: {integrity: sha512-+L3ccpzibovGXFK+Ap/f8LOS0ahMrHTf3xu7mMLSpEGU0EO9ucaysSylKo9eRDFNhWve/y275iPmIZ4z39a9iA==}
 
@@ -6664,6 +6777,10 @@ packages:
     resolution: {integrity: sha512-Acylog8/luQ8L7il+geoSxhEkazvkslg7PSNKOX59mbB9cOveP5aq9h74Y7YU8yDpJwetzQQrfIwtf4Wp4LKcw==}
     engines: {node: '>=4'}
 
+  type-fest@0.13.1:
+    resolution: {integrity: sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==}
+    engines: {node: '>=10'}
+
   type-fest@5.4.0:
     resolution: {integrity: sha512-wfkA6r0tBpVfGiyO+zbf9e10QkRQSlK9F2UvyfnjoCmrvH2bjHyhPzhugSBOuq1dog3P0+FKckqe+Xf6WKVjwg==}
     engines: {node: '>=20'}
@@ -7914,6 +8031,18 @@ snapshots:
     dependencies:
       hono: 4.11.3
 
+  '@huggingface/jinja@0.5.9': {}
+
+  '@huggingface/tokenizers@0.1.3': {}
+
+  '@huggingface/transformers@4.2.0':
+    dependencies:
+      '@huggingface/jinja': 0.5.9
+      '@huggingface/tokenizers': 0.1.3
+      onnxruntime-node: 1.24.3
+      onnxruntime-web: 1.26.0-dev.20260416-b7804b056c
+      sharp: 0.34.5
+
   '@humanfs/core@0.19.1': {}
 
   '@humanfs/node@0.16.7':
@@ -7933,8 +8062,7 @@ snapshots:
       '@iconify/types': 2.0.0
       mlly: 1.8.0
 
-  '@img/colour@1.1.0':
-    optional: true
+  '@img/colour@1.1.0': {}
 
   '@img/sharp-darwin-arm64@0.34.5':
     optionalDependencies:
@@ -8320,6 +8448,28 @@ snapshots:
     dependencies:
       playwright: 1.58.2
 
+  '@protobufjs/aspromise@1.1.2': {}
+
+  '@protobufjs/base64@1.1.2': {}
+
+  '@protobufjs/codegen@2.0.5': {}
+
+  '@protobufjs/eventemitter@1.1.1': {}
+
+  '@protobufjs/fetch@1.1.1':
+    dependencies:
+      '@protobufjs/aspromise': 1.1.2
+
+  '@protobufjs/float@1.0.2': {}
+
+  '@protobufjs/inquire@1.1.2': {}
+
+  '@protobufjs/path@1.1.2': {}
+
+  '@protobufjs/pool@1.1.0': {}
+
+  '@protobufjs/utf8@1.1.1': {}
+
   '@radix-ui/number@1.1.1': {}
 
   '@radix-ui/primitive@1.1.3': {}
@@ -10077,6 +10227,8 @@ snapshots:
 
   acorn@8.15.0: {}
 
+  adm-zip@0.5.17: {}
+
   agent-base@7.1.4: {}
 
   ajv-formats@3.0.1(ajv@8.17.1):
@@ -10230,6 +10382,8 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  boolean@3.2.0: {}
+
   brace-expansion@1.1.12:
     dependencies:
       balanced-match: 1.0.2
@@ -10742,6 +10896,8 @@ snapshots:
 
   detect-node-es@1.1.0: {}
 
+  detect-node@2.1.0: {}
+
   devlop@1.1.0:
     dependencies:
       dequal: 2.0.3
@@ -10943,6 +11099,8 @@ snapshots:
       is-date-object: 1.1.0
       is-symbol: 1.1.1
 
+  es6-error@4.1.1: {}
+
   esbuild@0.21.5:
     optionalDependencies:
       '@esbuild/aix-ppc64': 0.21.5
@@ -11039,8 +11197,8 @@ snapshots:
       '@next/eslint-plugin-next': 16.1.1
       eslint: 9.39.2(jiti@2.6.1)
       eslint-import-resolver-node: 0.3.9
-      eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1))
-      eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1))
+      eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1))
+      eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1))
       eslint-plugin-jsx-a11y: 6.10.2(eslint@9.39.2(jiti@2.6.1))
       eslint-plugin-react: 7.37.5(eslint@9.39.2(jiti@2.6.1))
       eslint-plugin-react-hooks: 7.0.1(eslint@9.39.2(jiti@2.6.1))
@@ -11062,7 +11220,7 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
-  eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)):
+  eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1)):
     dependencies:
       '@nolyfill/is-core-module': 1.0.39
       debug: 4.4.3
@@ -11073,22 +11231,22 @@ snapshots:
       tinyglobby: 0.2.15
       unrs-resolver: 1.11.1
     optionalDependencies:
-      eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1))
+      eslint-plugin-import: 2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1))
     transitivePeerDependencies:
       - supports-color
 
-  eslint-module-utils@2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)):
+  eslint-module-utils@2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)):
     dependencies:
       debug: 3.2.7
     optionalDependencies:
       '@typescript-eslint/parser': 8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3)
       eslint: 9.39.2(jiti@2.6.1)
       eslint-import-resolver-node: 0.3.9
-      eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1))
+      eslint-import-resolver-typescript: 3.10.1(eslint-plugin-import@2.32.0)(eslint@9.39.2(jiti@2.6.1))
     transitivePeerDependencies:
       - supports-color
 
-  eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)):
+  eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1)):
     dependencies:
       '@rtsao/scc': 1.1.0
       array-includes: 3.1.9
@@ -11099,7 +11257,7 @@ snapshots:
       doctrine: 2.1.0
       eslint: 9.39.2(jiti@2.6.1)
       eslint-import-resolver-node: 0.3.9
-      eslint-module-utils: 2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1(eslint-plugin-import@2.32.0(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1)))(eslint@9.39.2(jiti@2.6.1))
+      eslint-module-utils: 2.12.1(@typescript-eslint/parser@8.53.0(eslint@9.39.2(jiti@2.6.1))(typescript@5.9.3))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.10.1)(eslint@9.39.2(jiti@2.6.1))
       hasown: 2.0.2
       is-core-module: 2.16.1
       is-glob: 4.0.3
@@ -11425,6 +11583,8 @@ snapshots:
       flatted: 3.3.3
       keyv: 4.5.4
 
+  flatbuffers@25.9.23: {}
+
   flatted@3.3.3: {}
 
   for-each@0.3.5:
@@ -11537,6 +11697,15 @@ snapshots:
     dependencies:
       is-glob: 4.0.3
 
+  global-agent@3.0.0:
+    dependencies:
+      boolean: 3.2.0
+      es6-error: 4.1.1
+      matcher: 3.0.0
+      roarr: 2.15.4
+      semver: 7.7.4
+      serialize-error: 7.0.1
+
   global-directory@4.0.1:
     dependencies:
       ini: 4.1.1
@@ -11556,6 +11725,8 @@ snapshots:
 
   graphql@16.12.0: {}
 
+  guid-typescript@1.0.9: {}
+
   hachure-fill@0.5.2: {}
 
   happy-dom@20.8.9:
@@ -11974,6 +12145,8 @@ snapshots:
 
   json-stable-stringify-without-jsonify@1.0.1: {}
 
+  json-stringify-safe@5.0.1: {}
+
   json5@1.0.2:
     dependencies:
       minimist: 1.2.8
@@ -12127,6 +12300,8 @@ snapshots:
       chalk: 5.6.2
       is-unicode-supported: 1.3.0
 
+  long@5.3.2: {}
+
   longest-streak@3.1.0: {}
 
   loose-envify@1.4.0:
@@ -12172,6 +12347,10 @@ snapshots:
 
   marked@16.4.2: {}
 
+  matcher@3.0.0:
+    dependencies:
+      escape-string-regexp: 4.0.0
+
   math-intrinsics@1.1.0: {}
 
   mdast-util-find-and-replace@3.0.2:
@@ -12655,7 +12834,7 @@ snapshots:
       react: 19.2.3
       react-dom: 19.2.3(react@19.2.3)
 
-  next@16.1.7(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3):
+  next@16.1.7(@babel/core@7.28.6)(@playwright/test@1.58.2)(react-dom@19.2.3(react@19.2.3))(react@19.2.3):
     dependencies:
       '@next/env': 16.1.7
       '@swc/helpers': 0.5.15
@@ -12664,7 +12843,7 @@ snapshots:
       postcss: 8.4.31
       react: 19.2.3
       react-dom: 19.2.3(react@19.2.3)
-      styled-jsx: 5.1.6(react@19.2.3)
+      styled-jsx: 5.1.6(@babel/core@7.28.6)(react@19.2.3)
     optionalDependencies:
       '@next/swc-darwin-arm64': 16.1.7
       '@next/swc-darwin-x64': 16.1.7
@@ -12775,6 +12954,25 @@ snapshots:
       regex: 6.1.0
       regex-recursion: 6.0.2
 
+  onnxruntime-common@1.24.0-dev.20251116-b39e144322: {}
+
+  onnxruntime-common@1.24.3: {}
+
+  onnxruntime-node@1.24.3:
+    dependencies:
+      adm-zip: 0.5.17
+      global-agent: 3.0.0
+      onnxruntime-common: 1.24.3
+
+  onnxruntime-web@1.26.0-dev.20260416-b7804b056c:
+    dependencies:
+      flatbuffers: 25.9.23
+      guid-typescript: 1.0.9
+      long: 5.3.2
+      onnxruntime-common: 1.24.0-dev.20251116-b39e144322
+      platform: 1.3.6
+      protobufjs: 7.6.1
+
   open@11.0.0:
     dependencies:
       default-browser: 5.4.0
@@ -12894,6 +13092,8 @@ snapshots:
       mlly: 1.8.0
       pathe: 2.0.3
 
+  platform@1.3.6: {}
+
   playwright-core@1.58.2: {}
 
   playwright@1.58.2:
@@ -13066,6 +13266,21 @@ snapshots:
       prosemirror-state: 1.4.4
       prosemirror-transform: 1.11.0
 
+  protobufjs@7.6.1:
+    dependencies:
+      '@protobufjs/aspromise': 1.1.2
+      '@protobufjs/base64': 1.1.2
+      '@protobufjs/codegen': 2.0.5
+      '@protobufjs/eventemitter': 1.1.1
+      '@protobufjs/fetch': 1.1.1
+      '@protobufjs/float': 1.0.2
+      '@protobufjs/inquire': 1.1.2
+      '@protobufjs/path': 1.1.2
+      '@protobufjs/pool': 1.1.0
+      '@protobufjs/utf8': 1.1.1
+      '@types/node': 20.19.28
+      long: 5.3.2
+
   proxy-addr@2.0.7:
     dependencies:
       forwarded: 0.2.0
@@ -13399,6 +13614,15 @@ snapshots:
 
   reusify@1.1.0: {}
 
+  roarr@2.15.4:
+    dependencies:
+      boolean: 3.2.0
+      detect-node: 2.1.0
+      globalthis: 1.0.4
+      json-stringify-safe: 5.0.1
+      semver-compare: 1.0.0
+      sprintf-js: 1.1.3
+
   robust-predicates@3.0.2: {}
 
   rollup@4.55.1:
@@ -13488,6 +13712,8 @@ snapshots:
       refa: 0.12.1
       regexp-ast-analysis: 0.7.1
 
+  semver-compare@1.0.0: {}
+
   semver@6.3.1: {}
 
   semver@7.7.4: {}
@@ -13508,6 +13734,10 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  serialize-error@7.0.1:
+    dependencies:
+      type-fest: 0.13.1
+
   serve-static@2.2.1:
     dependencies:
       encodeurl: 2.0.0
@@ -13615,7 +13845,6 @@ snapshots:
       '@img/sharp-win32-arm64': 0.34.5
       '@img/sharp-win32-ia32': 0.34.5
       '@img/sharp-win32-x64': 0.34.5
-    optional: true
 
   shebang-command@2.0.0:
     dependencies:
@@ -13683,6 +13912,8 @@ snapshots:
 
   split2@4.2.0: {}
 
+  sprintf-js@1.1.3: {}
+
   stable-hash@0.0.5: {}
 
   stackback@0.0.2: {}
@@ -13807,10 +14038,12 @@ snapshots:
     dependencies:
       inline-style-parser: 0.2.7
 
-  styled-jsx@5.1.6(react@19.2.3):
+  styled-jsx@5.1.6(@babel/core@7.28.6)(react@19.2.3):
     dependencies:
       client-only: 0.0.1
       react: 19.2.3
+    optionalDependencies:
+      '@babel/core': 7.28.6
 
   stylis@4.3.6: {}
 
@@ -13926,6 +14159,8 @@ snapshots:
 
   type-detect@4.1.0: {}
 
+  type-fest@0.13.1: {}
+
   type-fest@5.4.0:
     dependencies:
       tagged-tag: 1.0.0
diff --git a/apps/web/app/settings/voice-mode/page.tsx b/apps/web/app/settings/voice-mode/page.tsx
new file mode 100644
index 000000000..2bcc1b851
--- /dev/null
+++ b/apps/web/app/settings/voice-mode/page.tsx
@@ -0,0 +1,21 @@
+import { VoiceModeSettings } from "@/components/settings/voice-mode-settings";
+import { StateProvider } from "@/components/state-provider";
+import { fetchUserSettings } from "@/lib/api";
+import { mapUserSettingsResponse } from "@/lib/ssr/user-settings";
+
+/** Voice Mode settings route: seeds the store from user settings fetched with no-store. */
+export default async function VoiceModeSettingsPage() {
+  let initialState = {};
+  try {
+    const settings = mapUserSettingsResponse(await fetchUserSettings({ cache: "no-store" }));
+    initialState = { userSettings: settings.loaded ? settings : undefined };
+  } catch {
+    // Fetch or mapping failed: render with the empty initial state assigned above.
+  }
+
+  return (
+    <StateProvider initialState={initialState}>
+      <VoiceModeSettings />
+    </StateProvider>
+  );
+}
diff --git a/apps/web/components/settings/editors-settings-state.tsx b/apps/web/components/settings/editors-settings-state.tsx
index 2d2e76e3b..ef891eeb5 100644
--- a/apps/web/components/settings/editors-settings-state.tsx
+++ b/apps/web/components/settings/editors-settings-state.tsx
@@ -7,7 +7,7 @@ import { createEditor, deleteEditor, updateEditor, updateUserSettings } from "@/
 import { useRequest } from "@/lib/http/use-request";
 import type { EditorOption } from "@/lib/types/http";
 import { type ComboboxOption } from "@/components/combobox";
-import { parseTerminalLinkBehavior } from "@/lib/ssr/user-settings";
+import { parseTerminalLinkBehavior, parseVoiceMode } from "@/lib/ssr/user-settings";
 import { fromApiSidebarView } from "@/lib/state/slices/ui/sidebar-view-wire";
 import {
   type EditorFormState,
@@ -245,6 +245,7 @@ function buildUserSettingsFromResponse(
     terminalFontFamily: s.terminal_font_family || null,
     terminalFontSize: s.terminal_font_size || null,
     changesPanelLayout: s.changes_panel_layout === "tree" ? ("tree" as const) : ("flat" as const),
+    voiceMode: parseVoiceMode(s.voice_mode),
     ...mapEditorSettingsFields(s),
   };
 }
diff --git a/apps/web/components/settings/keyboard-shortcuts-card.tsx b/apps/web/components/settings/keyboard-shortcuts-card.tsx
index 7b1c363be..adb484e27 100644
--- a/apps/web/components/settings/keyboard-shortcuts-card.tsx
+++ b/apps/web/components/settings/keyboard-shortcuts-card.tsx
@@ -17,7 +17,7 @@ import { useAppStore } from "@/components/state-provider";
 import { useToast } from "@/components/toast-provider";
 import { updateUserSettings } from "@/lib/api/domains/settings-api";
 
-function ShortcutRecorder({
+export function ShortcutRecorder({
   shortcutId,
   current,
   onChange,
diff --git a/apps/web/components/settings/settings-app-sidebar.tsx b/apps/web/components/settings/settings-app-sidebar.tsx
index 20993c191..1ea984397 100644
--- a/apps/web/components/settings/settings-app-sidebar.tsx
+++ b/apps/web/components/settings/settings-app-sidebar.tsx
@@ -11,6 +11,7 @@ import {
   IconCode,
   IconCpu,
   IconKey,
+  IconMicrophone,
   IconMessageCircle,
   IconBrandGithub,
   IconBrandGitlab,
@@ -315,6 +316,47 @@ function ExecutorsSidebarSection({ pathname, executors }: ExecutorsSidebarSectio
   );
 }
 
+type SimpleSidebarEntry = {
+  href: string; // link target; also used as the row's React key
+  label: string; // text rendered next to the icon
+  Icon: typeof IconBrandGithub; // icon component, typed after an existing icon import
+};
+
+/**
+ * Renders one sidebar row per entry: an icon and label linking to `href`.
+ * Used for the single-link entries (Automations, Prompts, Voice Mode, Utility
+ * Agents, External MCP); kept separate from `SettingsAppSidebar` so the
+ * parent function stays under the 100-line lint limit.
+ */
+function SimpleSidebarRows({
+  pathname,
+  entries,
+}: {
+  pathname: string;
+  entries: SimpleSidebarEntry[];
+}) {
+  // Automations matches by substring; every other row needs an exact pathname match.
+  const isRowActive = (href: string) => {
+    if (href === "/settings/automations") return pathname.includes("/automations");
+    return pathname === href;
+  };
+
+  return (
+    <>
+      {entries.map(({ href, label, Icon }) => (
+        <SidebarMenuItem key={href}>
+          <SidebarMenuButton asChild isActive={isRowActive(href)}>
+            <Link href={href}>
+              <Icon className="h-4 w-4" />
+              <span>{label}</span>
+            </Link>
+          </SidebarMenuButton>
+        </SidebarMenuItem>
+      ))}
+    </>
+  );
+}
+
 function SecretsSidebarSection({ pathname }: { pathname: string }) {
   return (
     <SidebarMenuItem>
@@ -369,57 +411,33 @@ export function SettingsAppSidebar() {
                 <WorkspacesSidebarSection pathname={pathname} workspaces={workspaces} />
                 <IntegrationsSidebarSection pathname={pathname} />
 
-                {/* Automations */}
-                <SidebarMenuItem>
-                  <SidebarMenuButton
-                    asChild
-                    isActive={
-                      pathname.startsWith("/settings/automations") ||
-                      pathname.includes("/automations")
-                    }
-                  >
-                    <Link href="/settings/automations">
-                      <IconBolt className="h-4 w-4" />
-                      <span>Automations</span>
-                    </Link>
-                  </SidebarMenuButton>
-                </SidebarMenuItem>
-
+                <SimpleSidebarRows
+                  pathname={pathname}
+                  entries={[
+                    { href: "/settings/automations", label: "Automations", Icon: IconBolt },
+                  ]}
+                />
                 <AgentsSidebarSection pathname={pathname} agents={agents} />
-
-                {/* Prompts */}
-                <SidebarMenuItem>
-                  <SidebarMenuButton asChild isActive={pathname === "/settings/prompts"}>
-                    <Link href="/settings/prompts">
-                      <IconMessageCircle className="h-4 w-4" />
-                      <span>Prompts</span>
-                    </Link>
-                  </SidebarMenuButton>
-                </SidebarMenuItem>
-
-                {/* Utility Agents */}
-                <SidebarMenuItem>
-                  <SidebarMenuButton asChild isActive={pathname === "/settings/utility-agents"}>
-                    <Link href="/settings/utility-agents">
-                      <IconWand className="h-4 w-4" />
-                      <span>Utility Agents</span>
-                    </Link>
-                  </SidebarMenuButton>
-                </SidebarMenuItem>
-
+                <SimpleSidebarRows
+                  pathname={pathname}
+                  entries={[
+                    { href: "/settings/prompts", label: "Prompts", Icon: IconMessageCircle },
+                    { href: "/settings/voice-mode", label: "Voice Mode", Icon: IconMicrophone },
+                    { href: "/settings/utility-agents", label: "Utility Agents", Icon: IconWand },
+                  ]}
+                />
                 <ExecutorsSidebarSection pathname={pathname} executors={executors} />
-
                 <SecretsSidebarSection pathname={pathname} />
-
-                {/* External MCP */}
-                <SidebarMenuItem>
-                  <SidebarMenuButton asChild isActive={pathname === "/settings/external-mcp"}>
-                    <Link href="/settings/external-mcp">
-                      <IconPlugConnected className="h-4 w-4" />
-                      <span>External MCP</span>
-                    </Link>
-                  </SidebarMenuButton>
-                </SidebarMenuItem>
+                <SimpleSidebarRows
+                  pathname={pathname}
+                  entries={[
+                    {
+                      href: "/settings/external-mcp",
+                      label: "External MCP",
+                      Icon: IconPlugConnected,
+                    },
+                  ]}
+                />
 
                 {/* System */}
                 <SystemSidebarSection pathname={pathname} />
diff --git a/apps/web/components/settings/voice-mode-settings.tsx b/apps/web/components/settings/voice-mode-settings.tsx
new file mode 100644
index 000000000..1a0d702cd
--- /dev/null
+++ b/apps/web/components/settings/voice-mode-settings.tsx
@@ -0,0 +1,518 @@
+"use client";
+
+import { useCallback, useMemo, useState } from "react";
+import { IconAlertTriangle, IconMicrophone } from "@tabler/icons-react";
+import { Badge } from "@kandev/ui/badge";
+import { Card, CardContent, CardHeader, CardTitle } from "@kandev/ui/card";
+import { Label } from "@kandev/ui/label";
+import { RadioGroup, RadioGroupItem } from "@kandev/ui/radio-group";
+import {
+  Select,
+  SelectContent,
+  SelectGroup,
+  SelectItem,
+  SelectLabel,
+  SelectTrigger,
+  SelectValue,
+} from "@kandev/ui/select";
+import { Switch } from "@kandev/ui/switch";
+import { useAppStore, useAppStoreApi } from "@/components/state-provider";
+import { useToast } from "@/components/toast-provider";
+import { updateUserSettings } from "@/lib/api";
+import { SettingsSection } from "@/components/settings/settings-section";
+import { ShortcutRecorder } from "@/components/settings/keyboard-shortcuts-card";
+import { detectVoiceCapabilities, type VoiceCapabilities } from "@/lib/voice/capabilities";
+import type { VoiceModeState } from "@/lib/state/slices/settings/types";
+import type { KeyboardShortcut } from "@/lib/keyboard/constants";
+import {
+  CONFIGURABLE_SHORTCUTS,
+  getShortcut,
+  type StoredShortcutOverrides,
+} from "@/lib/keyboard/shortcut-overrides";
+import type {
+  VoiceInputActivationMode,
+  VoiceInputEngine,
+  VoiceModeSettings as VoiceModeWire,
+  WhisperWebModelSize,
+} from "@/lib/types/http-voice";
+
+// Single source of truth for the Language select (`value` is saved, `label` is shown).
+// Web Speech reads `lang`, Whisper engines treat it as a hint. "auto" defers to the browser locale.
+const LANGUAGE_OPTIONS: Array<{ value: string; label: string }> = [
+  { value: "auto", label: "Auto-detect (browser language)" },
+  { value: "en-US", label: "English (United States)" },
+  { value: "en-GB", label: "English (United Kingdom)" },
+  { value: "es-ES", label: "Spanish (Spain)" },
+  { value: "es-MX", label: "Spanish (Mexico)" },
+  { value: "pt-PT", label: "Portuguese (Portugal)" },
+  { value: "pt-BR", label: "Portuguese (Brazil)" },
+  { value: "fr-FR", label: "French" },
+  { value: "de-DE", label: "German" },
+  { value: "it-IT", label: "Italian" },
+  { value: "ja-JP", label: "Japanese" },
+  { value: "zh-CN", label: "Chinese (Simplified)" },
+];
+
+const WHISPER_MODELS: Array<{
+  value: WhisperWebModelSize; // radio value, saved as the Whisper Web model
+  label: string; // model name shown in the list
+  size: string; // approximate download size shown after the label
+  hint: string; // one-line note shown under the label
+}> = [
+  { value: "tiny", label: "Tiny", size: "~40 MB", hint: "Fastest, lower accuracy" },
+  { value: "base", label: "Base", size: "~75 MB", hint: "Balanced default" },
+  { value: "small", label: "Small", size: "~240 MB", hint: "Best accuracy, slower load" },
+];
+
+/**
+ * Translates the store's voice settings into the wire payload sent as
+ * `voice_mode`: four keys are copied as-is, while `autoSend` and
+ * `whisperWebModel` are renamed to their snake_case wire names.
+ * @param state Voice settings in store (camelCase) form.
+ */
+function toWire(state: VoiceModeState): VoiceModeWire {
+  const { enabled, engine, language, mode, autoSend, whisperWebModel: model } = state;
+  return { enabled, engine, language, mode, auto_send: autoSend, whisper_web_model: model };
+}
+
+// ── Save hook ────────────────────────────────────────────────────────────
+
+function useVoiceModeSaver() {
+  // Returns `save(patch)` plus a `saving` flag. `save` applies the patch to the
+  // store optimistically, persists the full voice settings, and rolls back on
+  // failure. userSettings is read through the store API (not a React selector)
+  // so the async handler sees the latest snapshot at call time, not a stale
+  // closure that could overwrite unrelated fields changed by concurrent saves.
+  const storeApi = useAppStoreApi();
+  const setUserSettings = useAppStore((s) => s.setUserSettings);
+  const { toast } = useToast();
+  const [saving, setSaving] = useState(false);
+
+  const save = useCallback(
+    async (patch: Partial<VoiceModeState>) => {
+      const current = storeApi.getState().userSettings;
+      const previous = current.voiceMode;
+      const next = { ...previous, ...patch };
+      setUserSettings({ ...current, voiceMode: next });
+      setSaving(true);
+      try {
+        await updateUserSettings({ voice_mode: toWire(next) });
+      } catch {
+        // Roll back only the keys this request changed, and only while the live
+        // value still equals what we optimistically wrote. If a newer save for
+        // the same key landed first, that value is now the truth — reverting
+        // it would silently undo the user's later edit.
+        const latest = storeApi.getState().userSettings;
+        const reverted: Partial<VoiceModeState> = {};
+        for (const key of Object.keys(patch) as Array<keyof VoiceModeState>) {
+          if (latest.voiceMode[key] !== next[key]) continue;
+          // Widened to Record<string, unknown> so the per-key write type-checks.
+          (reverted as Record<string, unknown>)[key] = previous[key];
+        }
+        setUserSettings({
+          ...latest,
+          voiceMode: { ...latest.voiceMode, ...reverted },
+        });
+        toast({ title: "Failed to save Voice Mode setting", variant: "error" });
+      } finally {
+        setSaving(false);
+      }
+    },
+    [storeApi, setUserSettings, toast],
+  );
+
+  return { save, saving };
+}
+
+// ── Engine card ──────────────────────────────────────────────────────────
+
+type EngineOption = {
+  value: VoiceInputEngine; // radio value, saved as the voice engine
+  label: string; // option title
+  description: string; // helper text shown under the title
+  badge?: string; // optional badge text rendered next to the title
+  disabled?: boolean; // passed to the radio item; also dims the row
+};
+
+function buildEngineOptions(caps: VoiceCapabilities): EngineOption[] {
+  return [
+    {
+      value: "auto", // the only option that is never disabled
+      label: "Automatic",
+      description: "Use the best engine available in this browser.",
+    },
+    {
+      value: "webSpeech",
+      label: "Web Speech (in-browser)",
+      description: caps.webSpeech
+        ? "Free, instant, uses your browser's built-in speech recognition."
+        : "Not supported in this browser.",
+      disabled: !caps.webSpeech, // unsupported engines stay listed but cannot be selected
+    },
+    {
+      value: "whisperWeb",
+      label: "Whisper Web (private, in-browser)",
+      description: caps.whisperWeb
+        ? "Runs OpenAI Whisper entirely on this device. First use downloads the model (40–240 MB)."
+        : "Not supported in this browser.",
+      badge: "Local",
+      disabled: !caps.whisperWeb,
+    },
+    {
+      value: "whisperServer",
+      label: "Whisper Server (OpenAI)",
+      description: caps.audioCapture
+        ? "Sends audio to the backend, which forwards it to OpenAI's Whisper API. Requires a configured API key on the server."
+        : "Not supported in this browser.",
+      badge: "Server",
+      disabled: !caps.audioCapture, // gated on audio capture only; the server API key is not checked here
+    },
+  ];
+}
+
+/**
+ * Card listing the transcription engines as radio options; selecting one saves it
+ * as `voiceMode.engine`. Options whose `disabled` flag is set are dimmed.
+ */
+function EngineCard({ caps }: { caps: VoiceCapabilities }) {
+  const { engine } = useAppStore((s) => s.userSettings.voiceMode);
+  const { save, saving } = useVoiceModeSaver();
+  const options = useMemo(() => buildEngineOptions(caps), [caps]);
+  return (
+    <Card>
+      <CardHeader>
+        <CardTitle className="text-base">Transcription Engine</CardTitle>
+      </CardHeader>
+      <CardContent>
+        <RadioGroup
+          value={engine}
+          onValueChange={(v) => save({ engine: v as VoiceInputEngine })}
+          disabled={saving}
+          className="space-y-3"
+        >
+          {options.map(({ value, label, description, badge, disabled }) => {
+            const id = `voice-engine-${value}`;
+            const tone = disabled ? "opacity-50" : "cursor-pointer hover:bg-muted/30";
+            return (
+              <Label
+                key={value}
+                htmlFor={id}
+                className={`flex items-start gap-3 rounded-md border p-3 ${tone}`}
+              >
+                <RadioGroupItem id={id} value={value} disabled={disabled} className="mt-0.5" />
+                <div className="space-y-1">
+                  <div className="flex items-center gap-2 text-sm font-medium">
+                    {label}
+                    {badge && <Badge variant="secondary">{badge}</Badge>}
+                  </div>
+                  <p className="text-xs text-muted-foreground">{description}</p>
+                </div>
+              </Label>
+            );
+          })}
+        </RadioGroup>
+      </CardContent>
+    </Card>
+  );
+}
+
+// ── Behavior card (language + mode + auto-send) ──────────────────────────
+
+/**
+ * Language select for voice input; choosing an option saves it as `voiceMode.language`.
+ */
+function LanguageRow() {
+  const { language } = useAppStore((s) => s.userSettings.voiceMode);
+  const { save, saving } = useVoiceModeSaver();
+  const items = LANGUAGE_OPTIONS.map(({ value, label }) => (
+    <SelectItem key={value} value={value}>
+      {label}
+    </SelectItem>
+  ));
+  return (
+    <div className="space-y-2">
+      <Label htmlFor="voice-language">Language</Label>
+      <Select value={language} onValueChange={(v) => save({ language: v })} disabled={saving}>
+        <SelectTrigger id="voice-language">
+          <SelectValue />
+        </SelectTrigger>
+        <SelectContent>
+          <SelectGroup>
+            <SelectLabel>Languages</SelectLabel>
+            {items}
+          </SelectGroup>
+        </SelectContent>
+      </Select>
+      <p className="text-xs text-muted-foreground">
+        Recognition quality drops sharply when the language doesn&apos;t match what you&apos;re
+        speaking.
+      </p>
+    </div>
+  );
+}
+
+/**
+ * Activation picker ("toggle" or "hold"); the chosen value is saved as `voiceMode.mode`.
+ */
+function ModeRow() {
+  const { mode } = useAppStore((s) => s.userSettings.voiceMode);
+  const { save, saving } = useVoiceModeSaver();
+  const optionClass = "flex items-center gap-2 cursor-pointer";
+  const group = { value: mode, disabled: saving, className: "flex gap-4" };
+  return (
+    <div className="space-y-2">
+      <Label>Activation</Label>
+      <RadioGroup {...group} onValueChange={(v) => save({ mode: v as VoiceInputActivationMode })}>
+        <Label htmlFor="voice-mode-toggle" className={optionClass}>
+          <RadioGroupItem id="voice-mode-toggle" value="toggle" />
+          <span className="text-sm">Click to start / stop</span>
+        </Label>
+        <Label htmlFor="voice-mode-hold" className={optionClass}>
+          <RadioGroupItem id="voice-mode-hold" value="hold" />
+          <span className="text-sm">Hold to talk</span>
+        </Label>
+      </RadioGroup>
+    </div>
+  );
+}
+
+/**
+ * Switch for auto-send; toggling it saves the new value as `voiceMode.autoSend`.
+ * The switch is disabled while that save is pending.
+ */
+function AutoSendRow() {
+  const { autoSend } = useAppStore((s) => s.userSettings.voiceMode);
+  const { save, saving } = useVoiceModeSaver();
+  const toggle = { id: "voice-auto-send", checked: autoSend, disabled: saving };
+  return (
+    <div className="flex items-center justify-between">
+      <div className="space-y-1">
+        <Label htmlFor="voice-auto-send" className="cursor-pointer">
+          Auto-send after transcription
+        </Label>
+        <p className="text-xs text-muted-foreground">
+          Submit the message as soon as the transcript is inserted.
+        </p>
+      </div>
+      <Switch {...toggle} onCheckedChange={(checked) => save({ autoSend: checked })} />
+    </div>
+  );
+}
+
+/** Card grouping the language, activation and auto-send rows under a "Behavior" title. */
+function BehaviorCard() {
+  const header = <CardTitle className="text-base">Behavior</CardTitle>;
+  return (
+    <Card>
+      <CardHeader>{header}</CardHeader>
+      <CardContent className="space-y-5">
+        <LanguageRow />
+        <ModeRow />
+        <AutoSendRow />
+      </CardContent>
+    </Card>
+  );
+}
+
+// ── Whisper Web model card ───────────────────────────────────────────────
+
+/**
+ * Radio list of Whisper Web model sizes (label, approximate size, hint); the
+ * selection is saved as `voiceMode.whisperWebModel`.
+ */
+function WhisperModelCard() {
+  const { whisperWebModel } = useAppStore((s) => s.userSettings.voiceMode);
+  const { save, saving } = useVoiceModeSaver();
+  const rowClass = "flex items-start gap-3 rounded-md border p-3 cursor-pointer hover:bg-muted/30";
+  return (
+    <Card>
+      <CardHeader>
+        <CardTitle className="text-base">Whisper Web Model</CardTitle>
+      </CardHeader>
+      <CardContent>
+        <RadioGroup
+          value={whisperWebModel}
+          onValueChange={(v) => save({ whisperWebModel: v as WhisperWebModelSize })}
+          disabled={saving}
+          className="space-y-2"
+        >
+          {WHISPER_MODELS.map(({ value, label, size, hint }) => (
+            <Label key={value} htmlFor={`whisper-model-${value}`} className={rowClass}>
+              <RadioGroupItem id={`whisper-model-${value}`} value={value} className="mt-0.5" />
+              <div>
+                <div className="text-sm font-medium">
+                  {label} <span className="text-muted-foreground font-normal">· {size}</span>
+                </div>
+                <p className="text-xs text-muted-foreground">{hint}</p>
+              </div>
+            </Label>
+          ))}
+        </RadioGroup>
+        <p className="text-xs text-muted-foreground mt-3">
+          The model downloads on first use and is cached in your browser. Switching models triggers
+          another download next time you record.
+        </p>
+      </CardContent>
+    </Card>
+  );
+}
+
+// ── Enable card (top-level on/off) ───────────────────────────────────────
+
+/**
+ * Master on/off switch for voice input; toggling it saves `voiceMode.enabled`
+ * and leaves the other voice settings at their current values.
+ */
+function EnableCard() {
+  const { enabled } = useAppStore((s) => s.userSettings.voiceMode);
+  const { save, saving } = useVoiceModeSaver();
+  const toggle = { id: "voice-enabled", checked: enabled, disabled: saving };
+  return (
+    <Card>
+      <CardHeader>
+        <CardTitle className="text-base">Enable Voice Input</CardTitle>
+      </CardHeader>
+      <CardContent>
+        <div className="flex items-center justify-between">
+          <div className="space-y-1">
+            <Label htmlFor="voice-enabled" className="cursor-pointer">
+              Show the mic button on the chat composer
+            </Label>
+            <p className="text-xs text-muted-foreground">
+              When off, the voice button is hidden entirely and no voice-related code runs. Settings
+              below are preserved and re-applied when you turn it back on.
+            </p>
+          </div>
+          <Switch {...toggle} onCheckedChange={(checked) => save({ enabled: checked })} />
+        </div>
+      </CardContent>
+    </Card>
+  );
+}
+
+// ── Availability banner ──────────────────────────────────────────────────
+
+/** Warning banner rendered only when `caps` reports no voice capability at all. */
+function AvailabilityBanner({ caps }: { caps: VoiceCapabilities }) {
+  if (caps.webSpeech || caps.whisperWeb || caps.audioCapture) return null;
+  // A missing secure context is the most common reason capability detection
+  // comes back all-false on mobile (dev server reached over LAN HTTP), so that
+  // case gets its own explicit message instead of the generic browser hint.
+  const insecure = typeof window !== "undefined" && !window.isSecureContext;
+  const reason = insecure
+    ? "Microphone APIs require HTTPS or localhost. You appear to be on an insecure HTTP origin — load this page over HTTPS (or http://localhost) to enable voice input."
+    : "Your browser doesn't expose either the Web Speech API or MediaRecorder. Try Chrome, Edge, or Safari 14.5+.";
+  return (
+    <div className="flex items-start gap-3 rounded-md border border-orange-500/40 bg-orange-500/5 p-3">
+      <IconAlertTriangle className="h-5 w-5 text-orange-500 shrink-0 mt-0.5" />
+      <div className="space-y-1 text-sm">
+        <p className="font-medium">Voice input is unavailable in this browser.</p>
+        <p className="text-xs text-muted-foreground">{reason}</p>
+      </div>
+    </div>
+  );
+}
+
+// ── Voice keyboard shortcut card ─────────────────────────────────────────
+
+function useShortcutSaver() {
+  // Returns a saver that optimistically stores `next` and persists it, rolling
+  // back on failure. Like useVoiceModeSaver, it reads live store state at call
+  // time so a concurrent shortcut change from another card isn't clobbered.
+  const storeApi = useAppStoreApi();
+  const setUserSettings = useAppStore((s) => s.setUserSettings);
+  const { toast } = useToast();
+  return useCallback(
+    (next: StoredShortcutOverrides) => {
+      const current = storeApi.getState().userSettings;
+      const previous = current.keyboardShortcuts;
+      setUserSettings({ ...current, keyboardShortcuts: next });
+      updateUserSettings({ keyboard_shortcuts: next }).catch(() => {
+        // Roll back only the keys this request changed, and only while the live
+        // value still equals what we optimistically wrote; otherwise skip so a
+        // newer successful save to the same key isn't reverted.
+        const latest = storeApi.getState().userSettings;
+        const restored: StoredShortcutOverrides = { ...latest.keyboardShortcuts };
+        const changedKeys = new Set([...Object.keys(previous), ...Object.keys(next)]);
+        for (const key of changedKeys) {
+          if (previous[key] === next[key]) continue;
+          if (latest.keyboardShortcuts[key] !== next[key]) continue;
+          if (previous[key] === undefined) delete restored[key];
+          else restored[key] = previous[key];
+        }
+        setUserSettings({ ...latest, keyboardShortcuts: restored });
+        toast({ title: "Failed to save shortcut", variant: "error" });
+      });
+    },
+    [storeApi, setUserSettings, toast],
+  );
+}
+
+/** Card showing the VOICE_INPUT_TOGGLE shortcut with record and reset actions. */
+function VoiceShortcutCard() {
+  const shortcuts = useAppStore((s) => s.userSettings.keyboardShortcuts);
+  const saveShortcuts = useShortcutSaver();
+  const binding = getShortcut("VOICE_INPUT_TOGGLE", shortcuts);
+  // Both handlers copy the current overrides: change sets the entry, reset deletes it.
+  const handleChange = useCallback(
+    (_id: string, shortcut: KeyboardShortcut) =>
+      saveShortcuts({ ...shortcuts, VOICE_INPUT_TOGGLE: shortcut }),
+    [shortcuts, saveShortcuts],
+  );
+  const handleReset = useCallback(() => {
+    const remaining = { ...shortcuts };
+    delete remaining.VOICE_INPUT_TOGGLE;
+    saveShortcuts(remaining);
+  }, [shortcuts, saveShortcuts]);
+  return (
+    <Card>
+      <CardHeader>
+        <CardTitle className="text-base">
+          {CONFIGURABLE_SHORTCUTS.VOICE_INPUT_TOGGLE.label} Shortcut
+        </CardTitle>
+      </CardHeader>
+      <CardContent>
+        <ShortcutRecorder
+          shortcutId="VOICE_INPUT_TOGGLE"
+          current={binding}
+          onChange={handleChange}
+          onReset={handleReset}
+        />
+        <p className="text-xs text-muted-foreground mt-2">
+          Click the shortcut to record a new key combination. All keyboard shortcuts can also be
+          edited in General Settings.
+        </p>
+      </CardContent>
+    </Card>
+  );
+}
+
+// ── Page ─────────────────────────────────────────────────────────────────
+
+/** Voice Mode settings page: enable switch, then engine, behavior, model and shortcut cards. */
+export function VoiceModeSettings() {
+  const caps = useMemo(() => detectVoiceCapabilities(), []);
+  const enabled = useAppStore((s) => s.userSettings.voiceMode.enabled);
+  // Voice off: keep the secondary cards visible but dimmed, with pointer events disabled.
+  const secondaryClass = enabled ? undefined : "opacity-50 pointer-events-none";
+  return (
+    <SettingsSection
+      icon={<IconMicrophone className="h-5 w-5" />}
+      title="Voice Mode"
+      description="Configure how voice input works on the chat composer."
+    >
+      <div className="space-y-4">
+        <EnableCard />
+        <div className={secondaryClass}>
+          <div className="space-y-4">
+            <AvailabilityBanner caps={caps} />
+            <EngineCard caps={caps} />
+            <BehaviorCard />
+            <WhisperModelCard />
+            <VoiceShortcutCard />
+          </div>
+        </div>
+      </div>
+    </SettingsSection>
+  );
+}
diff --git a/apps/web/components/task/chat/chat-input-body.tsx b/apps/web/components/task/chat/chat-input-body.tsx
index aad579f9d..1141c038a 100644
--- a/apps/web/components/task/chat/chat-input-body.tsx
+++ b/apps/web/components/task/chat/chat-input-body.tsx
@@ -52,6 +52,10 @@ export type ChatInputEditorAreaProps = {
   onEnhancePrompt?: () => void;
   isEnhancingPrompt?: boolean;
   isUtilityConfigured?: boolean;
+  /** Inserts a voice transcript into the editor at the current cursor. */
+  onVoiceTranscript?: (text: string) => void;
+  /** Submit the message after a voice transcript is inserted (when auto-send is on). */
+  onVoiceAutoSend?: () => void;
 };
 
 function EditorWithTooltip({
@@ -123,6 +127,7 @@ export function ChatInputEditorArea(p: ChatInputEditorAreaProps) {
   const { isSending, onCancel, contextCount, contextPopoverOpen, setContextPopoverOpen } = p;
   const { contextFiles, onImplementPlan, onEnhancePrompt, isEnhancingPrompt } = p;
   const { isUtilityConfigured, hideSessionsDropdown, minimalToolbar, hidePlanMode } = p;
+  const { onVoiceTranscript, onVoiceAutoSend } = p;
   // Exclude auto-added plan context from the count — it's always present in plan mode
   // and shouldn't by itself enable the send button.
   const userContextCount = planContextEnabled ? Math.max(0, contextCount - 1) : contextCount;
@@ -186,6 +191,8 @@ export function ChatInputEditorArea(p: ChatInputEditorAreaProps) {
         isEnhancingPrompt={isEnhancingPrompt}
         isUtilityConfigured={isUtilityConfigured}
         onAttachFiles={handleAttachFiles}
+        onVoiceTranscript={onVoiceTranscript}
+        onVoiceAutoSend={onVoiceAutoSend}
         hideSessionsDropdown={hideSessionsDropdown}
         minimalToolbar={minimalToolbar}
         hidePlanMode={hidePlanMode}
diff --git a/apps/web/components/task/chat/chat-input-container.tsx b/apps/web/components/task/chat/chat-input-container.tsx
index 5975e4550..22ffd9cd9 100644
--- a/apps/web/components/task/chat/chat-input-container.tsx
+++ b/apps/web/components/task/chat/chat-input-container.tsx
@@ -250,6 +250,8 @@ type EnhancePromptExtras = {
   onEnhancePrompt?: () => void;
   isEnhancingPrompt?: boolean;
   isUtilityConfigured?: boolean;
+  onVoiceTranscript?: (text: string) => void;
+  onVoiceAutoSend?: () => void;
 };
 
 function buildEditorAreaProps(
@@ -295,6 +297,8 @@ function buildEditorAreaProps(
     onEnhancePrompt: extras.onEnhancePrompt,
     isEnhancingPrompt: extras.isEnhancingPrompt,
     isUtilityConfigured: extras.isUtilityConfigured,
+    onVoiceTranscript: extras.onVoiceTranscript,
+    onVoiceAutoSend: extras.onVoiceAutoSend,
     hideSessionsDropdown: p.hideSessionsDropdown,
     minimalToolbar: p.minimalToolbar,
     hidePlanMode: p.hidePlanMode,
@@ -359,6 +363,34 @@ export const ChatInputContainer = forwardRef<ChatInputContainerHandle, ChatInput
       });
     }, [s, enhancePrompt]);
 
+    const handleVoiceTranscript = useCallback(
+      (text: string) => {
+        const editor = s.inputRef.current;
+        if (!editor) return;
+        const trimmed = text.trim();
+        if (!trimmed) return;
+        const cursor = editor.getSelectionStart();
+        const current = editor.getValue();
+        // Prepend a space when inserting after existing non-whitespace content
+        // so transcripts flow naturally without running into the previous word.
+        const charBefore = cursor > 0 ? current.charAt(cursor - 1) : "";
+        const needsLeadingSpace = charBefore !== "" && !/\s/.test(charBefore);
+        const insert = needsLeadingSpace ? ` ${trimmed}` : trimmed;
+        editor.insertText(insert, cursor, cursor);
+      },
+      [s.inputRef],
+    );
+
+    // Auto-send fires the same submit path as the regular send button. Guards
+    // against firing while the input is in a disabled state (e.g. the agent
+    // is currently booting) — the button is hidden in that case anyway, but
+    // defence-in-depth so a stale keyboard shortcut press doesn't trigger.
+    const { submitDisabled: voiceSubmitDisabled, handleSubmitWithReset: voiceSubmit } = s;
+    const handleVoiceAutoSend = useCallback(() => {
+      if (voiceSubmitDisabled) return;
+      voiceSubmit();
+    }, [voiceSubmitDisabled, voiceSubmit]);
+
     if (p.isFailed || executorUnavailable) {
       return (
         <FailedSessionBanner
@@ -390,6 +422,8 @@ export const ChatInputContainer = forwardRef<ChatInputContainerHandle, ChatInput
           onEnhancePrompt: handleEnhancePrompt,
           isEnhancingPrompt,
           isUtilityConfigured,
+          onVoiceTranscript: handleVoiceTranscript,
+          onVoiceAutoSend: handleVoiceAutoSend,
         })}
       />
     );
diff --git a/apps/web/components/task/chat/chat-input-toolbar.tsx b/apps/web/components/task/chat/chat-input-toolbar.tsx
index 2447211d5..d8cb9ca10 100644
--- a/apps/web/components/task/chat/chat-input-toolbar.tsx
+++ b/apps/web/components/task/chat/chat-input-toolbar.tsx
@@ -31,6 +31,7 @@ import { ModeSelector } from "@/components/task/mode-selector";
 import { ContextPopover } from "./context-popover";
 import { ResetContextButton } from "./reset-context-button";
 import { ImplementPlanButton } from "./implement-plan-button";
+import { VoiceInputButton } from "./voice-input-button";
 import type { ContextFile } from "@/lib/state/context-files-store";
 
 export type ChatInputToolbarProps = {
@@ -67,6 +68,12 @@ export type ChatInputToolbarProps = {
   isUtilityConfigured?: boolean;
   /** Callback to open file picker for attaching files */
   onAttachFiles?: () => void;
+  /** Callback to insert a transcribed voice utterance into the editor. When
+   *  omitted, the voice button is hidden — keeps quick-chat / read-only
+   *  variants free of a button they can't wire. */
+  onVoiceTranscript?: (text: string) => void;
+  /** Optional auto-send hook fired after a voice transcript is inserted. */
+  onVoiceAutoSend?: () => void;
   /** Hide the sessions dropdown (for quick chat) */
   hideSessionsDropdown?: boolean;
   /** When true, only render the submit/cancel button — no other controls */
@@ -308,6 +315,8 @@ function ToolbarRightSection({
   onCancel,
   onSubmit,
   submitShortcut,
+  onVoiceTranscript,
+  onVoiceAutoSend,
 }: {
   showCollapsed: boolean;
   rightItems: ToolbarItemConfig[];
@@ -322,6 +331,8 @@ function ToolbarRightSection({
   onCancel: () => void;
   onSubmit: () => void;
   submitShortcut: (typeof SHORTCUTS)[keyof typeof SHORTCUTS];
+  onVoiceTranscript?: (text: string) => void;
+  onVoiceAutoSend?: () => void;
 }) {
   return (
     <div className="flex items-center gap-0.5 shrink-0">
@@ -330,7 +341,14 @@ function ToolbarRightSection({
       {planModeEnabled && !isAgentBusy && onImplementPlan && (
         <ImplementPlanButton onClick={onImplementPlan} />
       )}
-      <div className="ml-1">
+      <div className="ml-1 flex items-center gap-1">
+        {onVoiceTranscript && (
+          <VoiceInputButton
+            onTranscript={onVoiceTranscript}
+            onAutoSend={onVoiceAutoSend}
+            disabled={isDisabled}
+          />
+        )}
         <SubmitButton
           isAgentBusy={isAgentBusy}
           hasContent={hasContent}
@@ -588,6 +606,8 @@ export const ChatInputToolbar = memo(function ChatInputToolbar(rawProps: ChatInp
         onCancel={props.onCancel}
         onSubmit={props.onSubmit}
         submitShortcut={submitShortcut}
+        onVoiceTranscript={props.onVoiceTranscript}
+        onVoiceAutoSend={props.onVoiceAutoSend}
       />
     </div>
   );
diff --git a/apps/web/components/task/chat/voice-input-button.tsx b/apps/web/components/task/chat/voice-input-button.tsx
new file mode 100644
index 000000000..978bfebda
--- /dev/null
+++ b/apps/web/components/task/chat/voice-input-button.tsx
@@ -0,0 +1,265 @@
+"use client";
+
+import { useCallback, useEffect, useRef } from "react";
+import { IconLoader2, IconMicrophone, IconPlayerStopFilled } from "@tabler/icons-react";
+
+import { Button } from "@kandev/ui/button";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@kandev/ui/tooltip";
+import { cn } from "@/lib/utils";
+import {
+  useVoiceInput,
+  type VoiceError,
+  type VoiceInputState,
+  type VoiceModelLoadState,
+} from "@/hooks/use-voice-input";
+import { useAppStore } from "@/components/state-provider";
+import { useKeyboardShortcut } from "@/hooks/use-keyboard-shortcut";
+import { useToast } from "@/components/toast-provider";
+import { getShortcut } from "@/lib/keyboard/shortcut-overrides";
+
+type VoiceInputButtonProps = {
+  /** Inserts the recognized transcript at the current cursor position. */
+  onTranscript: (text: string) => void;
+  /** Called after a non-empty transcript was inserted, when auto-send is enabled. */
+  onAutoSend?: () => void;
+  /** Disable while the chat input itself is disabled (sending / starting / failed). */
+  disabled?: boolean;
+};
+
+const TOOLTIP_BY_STATE: Record<VoiceInputState, string> = {
+  idle: "Voice input",
+  requesting: "Requesting microphone…",
+  recording: "Stop recording",
+  processing: "Transcribing…",
+};
+
+const ARIA_BY_STATE: Record<VoiceInputState, string> = {
+  idle: "Start voice input",
+  requesting: "Requesting microphone permission",
+  recording: "Stop voice input",
+  processing: "Transcribing voice input",
+};
+
+// Icon shown inside the voice button. A spinner wins whenever work is pending
+// (requesting, processing, or the model is loading); otherwise a stop glyph
+// marks an active recording and the microphone is the resting state.
+function ButtonIcon({
+  state,
+  modelLoad,
+}: {
+  state: VoiceInputState;
+  modelLoad: VoiceModelLoadState;
+}) {
+  const busy = state === "processing" || state === "requesting" || modelLoad.state === "loading";
+  if (busy) return <IconLoader2 className="h-4 w-4 animate-spin" />;
+  if (state === "recording") return <IconPlayerStopFilled className="h-3.5 w-3.5" />;
+  return <IconMicrophone className="h-4 w-4" />;
+}
+
+// Surfaces a voice error as a toast. "no-speech" is shown as a plain toast;
+// every other code uses the "error" variant.
+function toastForError(toast: ReturnType<typeof useToast>["toast"], err: VoiceError) {
+  const plain = err.code === "no-speech";
+  if (plain) toast({ title: err.message });
+  else toast({ title: err.message, variant: "error" });
+}
+
+// ── Activation handlers ──────────────────────────────────────────────────
+
+// Hold-to-talk: a primary press starts recording; release, leave or cancel stops it.
+function buildHoldHandlers(start: () => Promise<void>, stop: () => Promise<void>) {
+  const halt = () => void stop();
+  const onPointerDown = (e: React.PointerEvent) => {
+    if (e.button !== 0) return; // right/middle clicks must not start a recording
+    e.preventDefault();
+    void start();
+  };
+  const onPointerUp = (e: React.PointerEvent) => {
+    e.preventDefault();
+    halt();
+  };
+  return { onPointerDown, onPointerUp, onPointerLeave: halt, onPointerCancel: halt };
+}
+
+// Click handler for toggle mode, fixed to the state it was built with.
+function buildToggleHandler(
+  state: VoiceInputState,
+  start: () => Promise<void>,
+  stop: () => Promise<void>,
+) {
+  if (state === "idle") return () => void start();
+  if (state === "recording") return () => void stop();
+  return () => undefined;
+}
+
+// ── Hook composition ─────────────────────────────────────────────────────
+
+function useAutoSendOnTranscript(
+  baseOnTranscript: (text: string) => void,
+  onAutoSend: (() => void) | undefined,
+  enabled: boolean,
+) {
+  // Forwards every transcript to the editor, then schedules auto-send on the
+  // next animation frame so the editor's onChange has flushed before submit
+  // reads from it. Blank transcripts insert nothing, so they must not submit
+  // whatever draft is already sitting in the editor.
+  return useCallback(
+    (text: string) => {
+      baseOnTranscript(text);
+      if (enabled && onAutoSend && text.trim()) requestAnimationFrame(() => onAutoSend());
+    },
+    [baseOnTranscript, onAutoSend, enabled],
+  );
+}
+
+// Wires the VOICE_INPUT_TOGGLE shortcut; state is read via a ref so the handler stays stable.
+function useVoiceShortcut(
+  enabled: boolean,
+  state: VoiceInputState,
+  start: () => Promise<void>,
+  stop: () => Promise<void>,
+) {
+  const overrides = useAppStore((s) => s.userSettings.keyboardShortcuts);
+  const latestState = useRef(state);
+  useEffect(() => {
+    latestState.current = state;
+  }, [state]);
+  const toggle = useCallback(() => {
+    if (latestState.current === "idle") void start();
+    else if (latestState.current === "recording") void stop();
+  }, [start, stop]);
+  useKeyboardShortcut(getShortcut("VOICE_INPUT_TOGGLE", overrides), toggle, { enabled });
+}
+
+// ── Unsupported fallback ────────────────────────────────────────────────
+
+// Explains, in user-facing terms, why voice input cannot run in this environment.
+function buildUnsupportedReason(): string {
+  if (typeof window === "undefined") return "Voice input is unavailable here.";
+  return window.isSecureContext
+    ? "Voice input isn't supported in this browser. Try Chrome, Edge, or Safari 14.5+."
+    : "Voice input needs HTTPS. Open this site over https:// (or http://localhost) — most mobile browsers block microphone APIs on insecure origins.";
+}
+
+// Greyed-out microphone rendered when voice input is unsupported; clicking explains why.
+function UnsupportedVoiceButton({ disabled }: { disabled?: boolean }) {
+  const { toast } = useToast();
+  const explain = () =>
+    toast({
+      title: "Voice input unavailable",
+      description: buildUnsupportedReason(),
+      variant: "error",
+    });
+  return (
+    <Tooltip>
+      <TooltipTrigger asChild>
+        <Button
+          type="button"
+          variant="secondary"
+          size="icon"
+          aria-label="Voice input unavailable"
+          data-testid="voice-input-button"
+          data-state="unsupported"
+          disabled={!!disabled}
+          onClick={explain}
+          className="h-7 w-7 rounded-full cursor-pointer text-muted-foreground/60"
+        >
+          <IconMicrophone className="h-3.5 w-3.5" />
+        </Button>
+      </TooltipTrigger>
+      <TooltipContent>Voice input unavailable — tap for details</TooltipContent>
+    </Tooltip>
+  );
+}
+
+// ── Component ────────────────────────────────────────────────────────────
+
+export function VoiceInputButton({ onTranscript, onAutoSend, disabled }: VoiceInputButtonProps) {
+  const enabled = useAppStore((s) => s.userSettings.voiceMode.enabled);
+  // Feature switched off in settings: render nothing. The real button lives in
+  // a child component so its hooks are only mounted while the feature is on
+  // and this early return never changes the hook order. This is separate from
+  // the `!supported` case (browser limitation), which renders a greyed icon.
+  if (!enabled) return null;
+  return (
+    <EnabledVoiceInputButton
+      onTranscript={onTranscript}
+      onAutoSend={onAutoSend}
+      disabled={disabled}
+    />
+  );
+}
+
+// Live voice button: connects useVoiceInput to the toolbar in toggle or hold mode.
+function EnabledVoiceInputButton({ onTranscript, onAutoSend, disabled }: VoiceInputButtonProps) {
+  const { toast } = useToast();
+  const voiceMode = useAppStore((s) => s.userSettings.voiceMode);
+  const handleError = useCallback((err: VoiceError) => toastForError(toast, err), [toast]);
+  const wrappedTranscript = useAutoSendOnTranscript(onTranscript, onAutoSend, voiceMode.autoSend);
+
+  const { supported, state, modelLoad, start, stop, cancel } = useVoiceInput({
+    onTranscript: wrappedTranscript,
+    onError: handleError,
+  });
+
+  // If the chat input gets disabled mid-recording, cancel rather than leave
+  // the mic indicator on. Hold-mode pointerup may not fire if focus moves.
+  useEffect(() => {
+    if (disabled && (state === "recording" || state === "requesting")) cancel();
+  }, [disabled, state, cancel]);
+
+  useVoiceShortcut(supported && !disabled, state, start, stop);
+
+  // Unsupported browsers still get a (greyed) button so users can tap it and
+  // learn why voice input is unavailable — usually a missing secure context,
+  // e.g. when reaching the dev server over LAN HTTP.
+  if (!supported) return <UnsupportedVoiceButton disabled={disabled} />;
+
+  const isRecording = state === "recording";
+  const isBusy = state === "requesting" || state === "processing" || modelLoad.state === "loading";
+  const holdMode = voiceMode.mode === "hold";
+
+  // NOTE(review): the button is disabled while "requesting"; confirm hold-mode release still stops.
+  const pointerHandlers = holdMode ? buildHoldHandlers(start, stop) : {};
+  const onClick = holdMode ? undefined : buildToggleHandler(state, start, stop);
+
+  // Styled to mirror SubmitButton (h-7 w-7 rounded-full primary fill) so the
+  // two input actions read as a pair; recording flips to a destructive fill
+  // with a pulsing ring so the active state is unmistakable even on mobile.
+  return (
+    <Tooltip>
+      <TooltipTrigger asChild>
+        <Button
+          type="button"
+          variant="default"
+          size="icon"
+          aria-label={ARIA_BY_STATE[state]}
+          aria-pressed={isRecording}
+          data-testid="voice-input-button"
+          data-state={state}
+          data-mode={voiceMode.mode}
+          disabled={!!disabled || (isBusy && state !== "recording")}
+          onClick={onClick}
+          {...pointerHandlers}
+          className={cn(
+            "h-7 w-7 rounded-full cursor-pointer relative select-none",
+            isRecording && "bg-destructive text-destructive-foreground hover:bg-destructive/90",
+          )}
+        >
+          <ButtonIcon state={state} modelLoad={modelLoad} />
+          {isRecording && (
+            <span
+              aria-hidden
+              className="absolute inset-0 rounded-full ring-2 ring-destructive/40 animate-pulse"
+            />
+          )}
+        </Button>
+      </TooltipTrigger>
+      <TooltipContent>
+        {modelLoad.state === "loading"
+          ? `Loading model… ${Math.round(modelLoad.progress * 100)}%`
+          : `${TOOLTIP_BY_STATE[state]}${holdMode && state === "idle" ? " (hold)" : ""}`}
+      </TooltipContent>
+    </Tooltip>
+  );
+}
diff --git a/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts b/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts
index 3722d205b..fdbcbfb42 100644
--- a/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts
+++ b/apps/web/e2e/tests/chat/toolbar-overflow.spec.ts
@@ -95,8 +95,10 @@ test.describe("Toolbar overflow menu", () => {
     // Context badge should be hidden when collapsed to avoid clipping
     await expect(contextBadge).not.toBeVisible();
 
-    // Submit button should remain visible (always-visible item)
-    const submitBtn = toolbar.locator("button.rounded-full");
+    // Submit button should remain visible (always-visible item). Target the
+    // submit testid specifically — the voice input button is also round, so a
+    // bare `button.rounded-full` locator now matches both and fails strict mode.
+    const submitBtn = toolbar.getByTestId("submit-message-button");
     await expect(submitBtn).toBeVisible();
 
     // Click expand toggle — items appear inline (scrollable)
diff --git a/apps/web/hooks/use-user-display-settings.ts b/apps/web/hooks/use-user-display-settings.ts
index 250e2bac2..c06dfb5c2 100644
--- a/apps/web/hooks/use-user-display-settings.ts
+++ b/apps/web/hooks/use-user-display-settings.ts
@@ -6,7 +6,10 @@ import { useAppStore } from "@/components/state-provider";
 import { useRepositories } from "@/hooks/domains/workspace/use-repositories";
 import { mapUserSettingsResponse } from "@/lib/ssr/user-settings";
 import { repositoryId, type Repository } from "@/lib/types/http";
-import type { UserSettingsState } from "@/lib/state/slices/settings/types";
+import {
+  DEFAULT_VOICE_MODE_STATE,
+  type UserSettingsState,
+} from "@/lib/state/slices/settings/types";
 
 type DisplaySettings = UserSettingsState;
 
@@ -36,7 +39,15 @@ function carryForwardTerminalSettings(current: DisplaySettings) {
   };
 }
 
-function carryForwardSettings(current: DisplaySettings) {
+function carryForwardLspSettings(current: DisplaySettings) {
+  return {
+    lspAutoStartLanguages: current.lspAutoStartLanguages ?? [],
+    lspAutoInstallLanguages: current.lspAutoInstallLanguages ?? [],
+    lspServerConfigs: current.lspServerConfigs ?? {},
+  };
+}
+
+function carryForwardCoreSettings(current: DisplaySettings) {
   return {
     shellOptions: current.shellOptions ?? [],
     defaultEditorId: current.defaultEditorId ?? null,
@@ -44,14 +55,19 @@ function carryForwardSettings(current: DisplaySettings) {
     reviewAutoMarkOnScroll: current.reviewAutoMarkOnScroll ?? true,
     showReleaseNotification: current.showReleaseNotification ?? true,
     releaseNotesLastSeenVersion: current.releaseNotesLastSeenVersion ?? null,
-    lspAutoStartLanguages: current.lspAutoStartLanguages ?? [],
-    lspAutoInstallLanguages: current.lspAutoInstallLanguages ?? [],
-    lspServerConfigs: current.lspServerConfigs ?? {},
     savedLayouts: current.savedLayouts ?? [],
     sidebarViews: current.sidebarViews ?? [],
     defaultUtilityAgentId: current.defaultUtilityAgentId ?? null,
     keyboardShortcuts: current.keyboardShortcuts ?? {},
     changesPanelLayout: current.changesPanelLayout ?? "flat",
+    voiceMode: current.voiceMode ?? { ...DEFAULT_VOICE_MODE_STATE },
+  };
+}
+
+function carryForwardSettings(current: DisplaySettings) {
+  return {
+    ...carryForwardCoreSettings(current),
+    ...carryForwardLspSettings(current),
     ...carryForwardTerminalSettings(current),
   };
 }
diff --git a/apps/web/hooks/use-voice-input.test.ts b/apps/web/hooks/use-voice-input.test.ts
new file mode 100644
index 000000000..3137cceba
--- /dev/null
+++ b/apps/web/hooks/use-voice-input.test.ts
@@ -0,0 +1,199 @@
+import { act, renderHook, waitFor } from "@testing-library/react";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+
+// ── Hoisted mocks (defined before the modules they replace are evaluated) ──
+
+const voicePrefs = vi.hoisted(() => ({
+  value: {
+    engine: "auto" as "auto" | "webSpeech" | "whisperWeb" | "whisperServer",
+    language: "auto",
+    mode: "toggle" as "toggle" | "hold",
+    autoSend: false,
+    whisperWebModel: "base" as "tiny" | "base" | "small",
+  },
+}));
+
+vi.mock("@/components/state-provider", () => ({
+  useAppStore: (
+    selector: (state: { userSettings: { voiceMode: typeof voicePrefs.value } }) => unknown,
+  ) => selector({ userSettings: { voiceMode: voicePrefs.value } }),
+}));
+
+const transcribeAudio = vi.hoisted(() => vi.fn());
+vi.mock("@/lib/api/domains/voice-api", () => ({ transcribeAudio }));
+
+// ── Mock SpeechRecognition ─────────────────────────────────────────────
+
+type SpeechHandle = {
+  start: () => void;
+  stop: () => void;
+  abort: () => void;
+  onresult: ((ev: { resultIndex: number; results: unknown }) => void) | null;
+  onerror: ((ev: { error: string }) => void) | null;
+  onend: (() => void) | null;
+  continuous: boolean;
+  interimResults: boolean;
+  maxAlternatives: number;
+  lang: string;
+  startCalls: number;
+  stopCalls: number;
+  abortCalls: number;
+};
+
+let recognitionInstance: SpeechHandle | null = null;
+
+// Factory pattern instead of `class` so we can avoid aliasing `this` in the
+// constructor (the lint rule disallows it) while still satisfying the
+// `new ()` shape that useVoiceInput's `new Ctor()` calls.
+function FakeSpeechRecognition() {
+  const handle: SpeechHandle = {
+    continuous: false,
+    interimResults: false,
+    maxAlternatives: 1,
+    lang: "",
+    onresult: null,
+    onerror: null,
+    onend: null,
+    startCalls: 0,
+    stopCalls: 0,
+    abortCalls: 0,
+    start() {
+      handle.startCalls += 1;
+    },
+    stop() {
+      handle.stopCalls += 1;
+    },
+    abort() {
+      handle.abortCalls += 1;
+    },
+  };
+  recognitionInstance = handle;
+  return handle;
+}
+
+// Import after mocks so the module under test sees the mocked store.
+import { useVoiceInput } from "./use-voice-input";
+
+// ── Tests ───────────────────────────────────────────────────────────────
+
+beforeEach(() => {
+  voicePrefs.value = {
+    engine: "auto",
+    language: "auto",
+    mode: "toggle",
+    autoSend: false,
+    whisperWebModel: "base",
+  };
+  recognitionInstance = null;
+  transcribeAudio.mockReset();
+  (window as unknown as { SpeechRecognition: unknown }).SpeechRecognition =
+    FakeSpeechRecognition as unknown as new () => SpeechHandle;
+  // MediaRecorder/getUserMedia not used in the auto→webSpeech path, but provide
+  // a stub so capability detection sees audioCapture available too.
+  (window as unknown as { MediaRecorder: { isTypeSupported: () => boolean } }).MediaRecorder = {
+    isTypeSupported: () => true,
+  };
+  Object.defineProperty(global.navigator, "mediaDevices", {
+    value: { getUserMedia: vi.fn() },
+    configurable: true,
+  });
+});
+
+afterEach(() => {
+  delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition;
+  delete (window as unknown as { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition;
+  delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder;
+});
+
+describe("useVoiceInput — Web Speech engine", () => {
+  it("reports supported and resolves engine = webSpeech under the default auto preference", () => {
+    const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() }));
+    expect(result.current.supported).toBe(true);
+    expect(result.current.engine).toBe("webSpeech");
+  });
+
+  it("transitions idle → recording on start() and emits the final transcript on stop()", async () => {
+    const onTranscript = vi.fn();
+    const { result } = renderHook(() => useVoiceInput({ onTranscript }));
+
+    await act(async () => {
+      await result.current.start();
+    });
+    expect(result.current.state).toBe("recording");
+    expect(recognitionInstance?.startCalls).toBe(1);
+
+    act(() => {
+      recognitionInstance?.onresult?.({
+        resultIndex: 0,
+        results: {
+          length: 1,
+          0: { isFinal: true, length: 1, 0: { transcript: "hello world" } },
+        } as unknown,
+      });
+      recognitionInstance?.onend?.();
+    });
+
+    await waitFor(() => {
+      expect(onTranscript).toHaveBeenCalledWith("hello world");
+      expect(result.current.state).toBe("idle");
+    });
+  });
+
+  it("maps a not-allowed permission error to a permission-denied VoiceError", async () => {
+    const onError = vi.fn();
+    const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn(), onError }));
+
+    await act(async () => {
+      await result.current.start();
+    });
+    act(() => {
+      recognitionInstance?.onerror?.({ error: "not-allowed" });
+    });
+
+    expect(onError).toHaveBeenCalledWith({
+      code: "permission-denied",
+      message: "Microphone permission denied.",
+    });
+    expect(result.current.state).toBe("idle");
+  });
+});
+
+describe("useVoiceInput — capability gating", () => {
+  it("returns supported=false and engine=null when no engine is usable", () => {
+    delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition;
+    delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder;
+    Object.defineProperty(global.navigator, "mediaDevices", { value: {}, configurable: true });
+
+    const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() }));
+    expect(result.current.supported).toBe(false);
+    expect(result.current.engine).toBeNull();
+  });
+
+  it("disables the hook entirely when enabled=false", () => {
+    const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn(), enabled: false }));
+    expect(result.current.supported).toBe(false);
+    expect(result.current.engine).toBeNull();
+  });
+});
+
+describe("useVoiceInput — language preference", () => {
+  it("passes the pinned BCP-47 language to SpeechRecognition.lang", async () => {
+    voicePrefs.value = { ...voicePrefs.value, language: "pt-PT" };
+    const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() }));
+
+    await act(async () => {
+      await result.current.start();
+    });
+    expect(recognitionInstance?.lang).toBe("pt-PT");
+  });
+
+  it("falls back to navigator.language when 'auto'", async () => {
+    voicePrefs.value = { ...voicePrefs.value, language: "auto" };
+    Object.defineProperty(global.navigator, "language", { value: "fr-FR", configurable: true });
+    const { result } = renderHook(() => useVoiceInput({ onTranscript: vi.fn() }));
+    await act(async () => {
+      await result.current.start();
+    });
+    expect(recognitionInstance?.lang).toBe("fr-FR");
+  });
+});
diff --git a/apps/web/hooks/use-voice-input.ts b/apps/web/hooks/use-voice-input.ts
new file mode 100644
index 000000000..454df30f8
--- /dev/null
+++ b/apps/web/hooks/use-voice-input.ts
@@ -0,0 +1,493 @@
+"use client";
+
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { ApiError } from "@/lib/api/client";
+import { transcribeAudio } from "@/lib/api/domains/voice-api";
+import { detectVoiceCapabilities, resolveActiveEngine } from "@/lib/voice/capabilities";
+import { WhisperWebClient, type WhisperWebProgress } from "@/lib/voice/whisper-web-client";
+import { useAppStore } from "@/components/state-provider";
+import type { VoiceInputEngine, WhisperWebModelSize } from "@/lib/types/http-voice";
+
+// ── Public types ────────────────────────────────────────────────────────
+
+/**
+ * Lifecycle of one dictation attempt: "requesting" while a capture engine
+ * waits for microphone access, "recording" while audio is being captured,
+ * "processing" while a finished capture is transcribed, "idle" otherwise.
+ */
+export type VoiceInputState = "idle" | "requesting" | "recording" | "processing";
+
+/** Coarse failure categories produced by this module's error mappers. */
+export type VoiceErrorCode =
+  | "permission-denied"
+  | "no-speech"
+  | "not-configured"
+  | "network"
+  | "unsupported"
+  | "model-load"
+  | "unknown";
+
+/** Error surfaced to callers: a machine-readable code plus a user-facing message. */
+export type VoiceError = { code: VoiceErrorCode; message: string };
+
+/** Whisper Web model loading status; `progress` is a 0–1 fraction. */
+export type VoiceModelLoadState = {
+  state: "idle" | "loading" | "ready" | "error";
+  progress: number;
+};
+
+/** Options accepted by `useVoiceInput`. */
+export type UseVoiceInputOptions = {
+  /** Receives the trimmed transcript when an attempt yields non-empty text. */
+  onTranscript: (text: string) => void;
+  /** Receives every error the hook emits (the same value is stored in `error`). */
+  onError?: (error: VoiceError) => void;
+  /** Set false to disable the hook entirely (e.g. for read-only contexts). */
+  enabled?: boolean;
+};
+
+/** Value returned by `useVoiceInput`. */
+export type UseVoiceInputResult = {
+  /** True when a concrete engine could be resolved (`engine !== null`). */
+  supported: boolean;
+  /** The resolved engine, or null when the hook is disabled or nothing is usable. */
+  engine: Exclude<VoiceInputEngine, "auto"> | null;
+  state: VoiceInputState;
+  /** Most recent error; cleared by `start()` and `cancel()`. */
+  error: VoiceError | null;
+  modelLoad: VoiceModelLoadState;
+  /** Begins an attempt; reports "unsupported" when no engine is available. */
+  start: () => Promise<void>;
+  /** Ends the active attempt; any transcript is delivered through `onTranscript`. */
+  stop: () => Promise<void>;
+  /** Aborts the active driver and resets `state` and `error`. */
+  cancel: () => void;
+};
+
+// ── Web Speech typings (DOM lib doesn't ship them) ─────────────────────
+
+type SpeechAlt = { transcript: string };
+type SpeechResult = { isFinal: boolean; 0: SpeechAlt; length: number };
+type SpeechResultList = { length: number; [index: number]: SpeechResult };
+type SpeechResultEvent = { resultIndex: number; results: SpeechResultList };
+type SpeechErrorEvent = { error: string; message?: string };
+type SpeechRecognitionInstance = {
+  lang: string;
+  continuous: boolean;
+  interimResults: boolean;
+  maxAlternatives: number;
+  start: () => void;
+  stop: () => void;
+  abort: () => void;
+  onresult: ((ev: SpeechResultEvent) => void) | null;
+  onerror: ((ev: SpeechErrorEvent) => void) | null;
+  onend: (() => void) | null;
+};
+
+type SpeechCtor = new () => SpeechRecognitionInstance;
+
+/**
+ * Instantiates the browser's speech recognizer, preferring the standard
+ * `SpeechRecognition` constructor over the `webkit`-prefixed one.
+ * Returns null when there is no `window` (server render) or when neither
+ * constructor is defined.
+ */
+function createSpeechRecognition(): SpeechRecognitionInstance | null {
+  if (typeof window === "undefined") return null;
+  const host = window as Window & {
+    SpeechRecognition?: SpeechCtor;
+    webkitSpeechRecognition?: SpeechCtor;
+  };
+  const RecognitionCtor = host.SpeechRecognition ?? host.webkitSpeechRecognition;
+  if (!RecognitionCtor) return null;
+  return new RecognitionCtor();
+}
+
+// ── Error mappers ───────────────────────────────────────────────────────
+
+/**
+ * Translates a Web Speech `error` code into the hook's `VoiceError` shape.
+ * Codes without a dedicated branch are reported as "unknown" with the raw
+ * code embedded in the message.
+ */
+function mapSpeechError(code: string): VoiceError {
+  switch (code) {
+    case "not-allowed":
+    case "service-not-allowed":
+      return { code: "permission-denied", message: "Microphone permission denied." };
+    case "no-speech":
+      return { code: "no-speech", message: "No speech detected. Try again." };
+    case "network":
+      return { code: "network", message: "Voice recognition lost network connection." };
+    case "audio-capture":
+      return { code: "unknown", message: "No microphone was found." };
+    default:
+      return { code: "unknown", message: `Voice recognition error: ${code}` };
+  }
+}
+
+/**
+ * Translates a rejection from microphone setup into a `VoiceError`, keyed on
+ * the thrown object's `name`. Values without a recognised `name` (including
+ * non-objects) become a generic "Failed to start recording." error.
+ */
+function mapMicError(err: unknown): VoiceError {
+  const hasName = Boolean(err) && typeof err === "object" && "name" in (err as object);
+  const name = hasName ? (err as { name: string }).name : undefined;
+  switch (name) {
+    case "NotAllowedError":
+    case "SecurityError":
+      return { code: "permission-denied", message: "Microphone permission denied." };
+    case "NotFoundError":
+    case "OverconstrainedError":
+      return { code: "unknown", message: "No microphone was found." };
+    default:
+      return { code: "unknown", message: "Failed to start recording." };
+  }
+}
+
+/**
+ * Translates a failure from the server transcription call into a `VoiceError`.
+ * Only an `ApiError` with status 503 is treated as "not-configured"; every
+ * other failure is reported as a retryable "network" error.
+ */
+function mapTranscribeError(err: unknown): VoiceError {
+  const notConfigured = err instanceof ApiError && err.status === 503;
+  if (!notConfigured) {
+    return { code: "network", message: "Transcription failed. Please try again." };
+  }
+  const message =
+    "Server-side transcription isn't configured. Pick Web Speech or Whisper Web in Voice Mode settings.";
+  return { code: "not-configured", message };
+}
+
+/**
+ * Wraps a Whisper Web failure as a "model-load" `VoiceError`.
+ *
+ * Uses the thrown `Error`'s own message when it has visible content;
+ * non-Error values and errors with an empty or whitespace-only message fall
+ * back to a generic sentence so callers never surface a blank error.
+ */
+function whisperErrorMessage(err: unknown): VoiceError {
+  const fallback = "Whisper Web failed to transcribe.";
+  const detail = err instanceof Error ? err.message : "";
+  return { code: "model-load", message: detail.trim() ? detail : fallback };
+}
+
+/**
+ * Picks the language tag handed to the Web Speech recognizer: the user's
+ * pinned preference when one is set, otherwise `navigator.language`, or
+ * "en-US" when no `navigator` global exists.
+ */
+function resolveLang(preference: string): string {
+  const pinned = Boolean(preference) && preference !== "auto";
+  if (pinned) return preference;
+  if (typeof navigator === "undefined") return "en-US";
+  return navigator.language;
+}
+
+/**
+ * Derives the language hint passed to Whisper from the stored preference.
+ *
+ * Returns undefined for an empty or "auto" preference. Otherwise the stored
+ * BCP-47 tag ("en-US", "pt-BR") is reduced to its lower-cased primary subtag
+ * ("en", "pt"), because Whisper's tokenizer works with bare language codes
+ * and a region-qualified hint would be silently dropped by the pipeline,
+ * which would then auto-detect the language instead.
+ */
+function resolveWhisperLang(preference: string): string | undefined {
+  if (!preference || preference === "auto") return undefined;
+  const cut = preference.indexOf("-");
+  // A leading dash (cut === 0) leaves no primary subtag, so keep the tag whole.
+  const primary = cut > 0 ? preference.slice(0, cut) : preference;
+  return primary.toLowerCase();
+}
+
+// ── MediaRecorder capture primitive ─────────────────────────────────────
+
+/**
+ * Chooses the recording container for MediaRecorder, returning the first
+ * candidate the browser reports as supported together with the file
+ * extension used when naming the upload. Falls back to an empty MIME type
+ * (recorder default) with a "webm" extension when MediaRecorder is missing
+ * or none of the candidates is supported.
+ */
+function pickRecorderMime(): { mime: string; ext: string } {
+  const fallback = { mime: "", ext: "webm" };
+  if (typeof window === "undefined" || typeof window.MediaRecorder === "undefined") {
+    return fallback;
+  }
+  // Ordered by preference: the first supported entry wins.
+  const candidates: Array<{ mime: string; ext: string }> = [
+    { mime: "audio/webm;codecs=opus", ext: "webm" },
+    { mime: "audio/webm", ext: "webm" },
+    { mime: "audio/mp4", ext: "m4a" },
+    { mime: "audio/ogg;codecs=opus", ext: "ogg" },
+    { mime: "audio/wav", ext: "wav" },
+  ];
+  const supported = candidates.find((c) => window.MediaRecorder.isTypeSupported(c.mime));
+  return supported ?? fallback;
+}
+
+type CaptureHandle = {
+  stream: MediaStream;
+  recorder: MediaRecorder;
+  chunks: Blob[];
+  mime: string;
+  ext: string;
+};
+
+/**
+ * Requests microphone access and starts a MediaRecorder on the resulting
+ * stream, buffering every non-empty chunk the recorder emits.
+ *
+ * Rejects with whatever `getUserMedia`, the MediaRecorder constructor or
+ * `recorder.start()` throws. When the failure happens after the stream was
+ * granted, its tracks are stopped before rethrowing: the caller never gets a
+ * handle in that case, so nothing else would be able to release the mic.
+ */
+async function startCapture(): Promise<CaptureHandle> {
+  const { mime, ext } = pickRecorderMime();
+  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+  try {
+    const recorder = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
+    const chunks: Blob[] = [];
+    recorder.addEventListener("dataavailable", (e) => {
+      if (e.data && e.data.size > 0) chunks.push(e.data);
+    });
+    recorder.start();
+    return { stream, recorder, chunks, mime, ext };
+  } catch (err) {
+    for (const track of stream.getTracks()) track.stop();
+    throw err;
+  }
+}
+
+/** Stops every track of the handle's stream; a null handle is ignored. */
+function teardownCapture(handle: CaptureHandle | null) {
+  if (handle === null) return;
+  handle.stream.getTracks().forEach((track) => {
+    track.stop();
+  });
+}
+
+/**
+ * Stops the recorder and resolves with the recorded audio as a single Blob.
+ *
+ * Resolves with null when the recorder is already inactive or when no chunk
+ * was collected. The stream's tracks are torn down in every case. The Blob's
+ * type is the recorder's reported MIME type, then the MIME type requested at
+ * capture start, then "audio/webm".
+ */
+function stopCapture(handle: CaptureHandle): Promise<Blob | null> {
+  const { recorder, chunks } = handle;
+  return new Promise((resolve) => {
+    if (recorder.state === "inactive") {
+      teardownCapture(handle);
+      resolve(null);
+      return;
+    }
+    const onStopped = () => {
+      const type = recorder.mimeType || handle.mime || "audio/webm";
+      const recording = chunks.length > 0 ? new Blob(chunks, { type }) : null;
+      teardownCapture(handle);
+      resolve(recording);
+    };
+    // Register before calling stop() so the "stop" event cannot be missed.
+    recorder.addEventListener("stop", onStopped, { once: true });
+    recorder.stop();
+  });
+}
+
+// ── Driver refs ─────────────────────────────────────────────────────────
+
+type ActiveDriverRef =
+  | { kind: "webSpeech"; recognition: SpeechRecognitionInstance }
+  | { kind: "capture"; handle: CaptureHandle; engine: "whisperWeb" | "whisperServer" }
+  | null;
+
+type DriverRefBox = { current: ActiveDriverRef };
+type WhisperRefBox = { current: WhisperWebClient | null };
+
+/**
+ * Discards whatever driver the ref currently holds without producing a
+ * transcript, then clears the ref. Does nothing when the ref is empty.
+ */
+function abortDriver(ref: DriverRefBox) {
+  const active = ref.current;
+  if (!active) return;
+  if (active.kind === "capture") {
+    teardownCapture(active.handle);
+  } else {
+    const { recognition } = active;
+    // Some browsers still fire onerror/onend after .abort(). Unhook the
+    // callbacks first so those trailing events cannot touch hook state the
+    // caller (cancel()) has just reset.
+    recognition.onresult = null;
+    recognition.onerror = null;
+    recognition.onend = null;
+    recognition.abort();
+  }
+  ref.current = null;
+}
+
+// ── Web Speech driver ───────────────────────────────────────────────────
+
+type WebSpeechHandlers = {
+  setState: (s: VoiceInputState) => void;
+  driverRef: DriverRefBox;
+  emitError: (e: VoiceError) => void;
+  onTranscriptRef: { current: (text: string) => void };
+  lang: string;
+};
+
+/**
+ * Starts a Web Speech recognition session and wires its events to the hook.
+ *
+ * Final results are accumulated while the session runs and delivered as one
+ * space-joined, trimmed string through `onTranscriptRef` when the recognizer
+ * fires `onend`. Recognizer errors are mapped and forwarded to `emitError`.
+ * Emits "unsupported" when no recognizer can be created, and "unknown" when
+ * `recognition.start()` throws.
+ */
+function runWebSpeech(h: WebSpeechHandlers): void {
+  const recognition = createSpeechRecognition();
+  if (!recognition) {
+    h.emitError({ code: "unsupported", message: "Voice recognition is not supported." });
+    return;
+  }
+  // Final phrases collected across the whole session.
+  const transcripts: string[] = [];
+  recognition.continuous = true;
+  // Only final results are requested; interim hypotheses are not used.
+  recognition.interimResults = false;
+  recognition.maxAlternatives = 1;
+  recognition.lang = h.lang;
+  recognition.onresult = (ev) => {
+    // Walk only the results reported as new by this event.
+    for (let i = ev.resultIndex; i < ev.results.length; i++) {
+      const r = ev.results[i];
+      if (r.isFinal && r[0]?.transcript) transcripts.push(r[0].transcript.trim());
+    }
+  };
+  recognition.onerror = (ev) => h.emitError(mapSpeechError(ev.error));
+  recognition.onend = () => {
+    // Session is over: release the driver and return to idle before handing
+    // the transcript to the caller.
+    h.driverRef.current = null;
+    h.setState("idle");
+    const joined = transcripts.join(" ").trim();
+    if (joined) h.onTranscriptRef.current(joined);
+  };
+  try {
+    recognition.start();
+    // Publish the driver only once start() succeeded.
+    h.driverRef.current = { kind: "webSpeech", recognition };
+    h.setState("recording");
+  } catch {
+    h.emitError({ code: "unknown", message: "Failed to start voice recognition." });
+  }
+}
+
+// ── Capture engines (whisperWeb + whisperServer) ───────────────────────
+
+type CaptureHandlers = {
+  setState: (s: VoiceInputState) => void;
+  emitError: (e: VoiceError) => void;
+  driverRef: DriverRefBox;
+};
+
+/**
+ * Starts a MediaRecorder capture for one of the Whisper engines.
+ *
+ * Moves the hook to "requesting", awaits `startCapture()`, then stores the
+ * capture driver in `driverRef` and moves to "recording". Any failure is
+ * mapped with `mapMicError` and reported through `emitError`; this function
+ * itself never rejects for that reason.
+ *
+ * Note: `driverRef` stays unset until `startCapture()` resolves, so an
+ * `abortDriver` call made while the request is pending finds nothing to tear
+ * down. Callers that can cancel during that window must handle it themselves.
+ */
+async function beginCapture(
+  which: "whisperWeb" | "whisperServer",
+  h: CaptureHandlers,
+): Promise<void> {
+  h.setState("requesting");
+  try {
+    const handle = await startCapture();
+    // Remember which engine started the capture so the stop path can route
+    // the audio to the matching transcriber.
+    h.driverRef.current = { kind: "capture", handle, engine: which };
+    h.setState("recording");
+  } catch (err) {
+    h.emitError(mapMicError(err));
+  }
+}
+
+type FinishCaptureHandlers = {
+  driverRef: DriverRefBox;
+  whisperRef: WhisperRefBox;
+  setState: (s: VoiceInputState) => void;
+  setModelLoad: (next: VoiceModelLoadState) => void;
+  emitError: (e: VoiceError) => void;
+  onTranscriptRef: { current: (text: string) => void };
+  whisperModel: WhisperWebModelSize;
+  language: string;
+};
+
+/**
+ * Stops the active capture, transcribes the audio with the engine recorded
+ * on the driver and delivers any non-empty text through `onTranscriptRef`.
+ *
+ * Does nothing when the current driver is not a capture driver. A capture
+ * that produced no audio simply returns the hook to "idle". Transcription
+ * failures are mapped per engine and reported through `emitError`.
+ */
+async function finishCapture(h: FinishCaptureHandlers): Promise<void> {
+  const active = h.driverRef.current;
+  if (!active || active.kind !== "capture") return;
+  // Take ownership of the driver synchronously, before the first await. In
+  // hold mode pointerup and pointerleave fire in the same task and both call
+  // stop(); if the ref were still set, the second call would enter here too,
+  // race the first, and could wipe the driverRef of a recording the user
+  // started in between.
+  h.driverRef.current = null;
+  h.setState("processing");
+  const { handle, engine } = active;
+  const blob = await stopCapture(handle);
+  if (!blob) {
+    h.setState("idle");
+    return;
+  }
+  const viaServer = engine === "whisperServer";
+  try {
+    let text: string;
+    if (viaServer) text = await transcribeViaServer(blob, handle.ext);
+    else text = await transcribeViaWhisperWeb(blob, h);
+    if (text) h.onTranscriptRef.current(text);
+    h.setState("idle");
+  } catch (err) {
+    h.emitError(viaServer ? mapTranscribeError(err) : whisperErrorMessage(err));
+  }
+}
+
+/**
+ * Uploads the recording to the backend as `recording.<ext>` and returns the
+ * trimmed transcript. Rejections from `transcribeAudio` propagate unchanged.
+ */
+async function transcribeViaServer(blob: Blob, ext: string): Promise<string> {
+  const filename = `recording.${ext}`;
+  const { text } = await transcribeAudio(blob, filename);
+  return text.trim();
+}
+
+/**
+ * Transcribes the recording with the in-browser Whisper client, creating and
+ * initialising the client first if needed, and returns the trimmed text.
+ */
+async function transcribeViaWhisperWeb(blob: Blob, h: FinishCaptureHandlers): Promise<string> {
+  const languageHint = resolveWhisperLang(h.language);
+  const whisper = await ensureWhisperClient(h);
+  const raw = await whisper.transcribe(blob, languageHint);
+  return raw.trim();
+}
+
+/**
+ * Returns an initialised Whisper Web client for `h.whisperModel`, creating
+ * one (and storing it in `whisperRef`) when the ref is empty.
+ *
+ * Progress reported by the client is forwarded to `setModelLoad`; on success
+ * the load state becomes "ready", on an init failure it becomes "error" and
+ * the original error is rethrown.
+ *
+ * The client is held in a local variable across the `await`: other code can
+ * clear `whisperRef` while the model is loading (model-size change, unmount),
+ * and re-reading the ref afterwards would hand back null. When that happens
+ * this rejects with a descriptive error instead of reporting "ready" for a
+ * client that has been discarded.
+ */
+async function ensureWhisperClient(h: FinishCaptureHandlers): Promise<WhisperWebClient> {
+  let client = h.whisperRef.current;
+  if (!client) {
+    client = new WhisperWebClient({
+      onProgress: (p: WhisperWebProgress) =>
+        // transformers.js emits progress on a 0–100 scale, but the rest of the
+        // pipeline (and the button's `* 100` display) treats `modelLoad.progress`
+        // as a 0–1 fraction (matching the `ready: 1` convention below). Normalise
+        // here so the button doesn't render "5000%" mid-download.
+        h.setModelLoad({ state: "loading", progress: p.progress / 100 }),
+    });
+    h.whisperRef.current = client;
+    h.setModelLoad({ state: "loading", progress: 0 });
+  }
+  try {
+    await client.init(h.whisperModel);
+  } catch (err) {
+    h.setModelLoad({ state: "error", progress: 0 });
+    throw err;
+  }
+  if (h.whisperRef.current !== client) {
+    throw new Error("The Whisper Web model was changed while it was loading. Please try again.");
+  }
+  h.setModelLoad({ state: "ready", progress: 1 });
+  return client;
+}
+
+// ── Hook helpers ────────────────────────────────────────────────────────
+
+/** Selects the Voice Mode preferences from the app store's user settings. */
+function useVoiceModePrefs() {
+  const voiceMode = useAppStore((store) => store.userSettings.voiceMode);
+  return voiceMode;
+}
+
+/**
+ * Mirrors the caller's callbacks into refs so long-lived handlers can always
+ * invoke the latest `onTranscript` / `onError` without being re-created.
+ */
+function useCallbackRefs(opts: UseVoiceInputOptions) {
+  const onTranscriptRef = useRef(opts.onTranscript);
+  const onErrorRef = useRef(opts.onError);
+  // No dependency array: the refs are refreshed after every render.
+  useEffect(() => {
+    onTranscriptRef.current = opts.onTranscript;
+    onErrorRef.current = opts.onError;
+  });
+  return { onTranscriptRef, onErrorRef };
+}
+
+/**
+ * Drops the cached Whisper client whenever the selected model size changes,
+ * so the next transcription builds a fresh client instead of reusing the
+ * previous in-memory model. Also calls `reset` so the caller can clear its
+ * model-load state. The first render never disposes anything.
+ */
+function useDisposeWhisperOnModelChange(
+  whisperRef: WhisperRefBox,
+  modelSize: string,
+  reset: () => void,
+) {
+  const lastModelRef = useRef(modelSize);
+  useEffect(() => {
+    if (lastModelRef.current !== modelSize) {
+      lastModelRef.current = modelSize;
+      whisperRef.current?.dispose();
+      whisperRef.current = null;
+      reset();
+    }
+  }, [modelSize, whisperRef, reset]);
+}
+
+/**
+ * Registers an effect cleanup that aborts the active driver and disposes the
+ * cached Whisper client, clearing both refs.
+ */
+function useUnmountCleanup(driverRef: DriverRefBox, whisperRef: WhisperRefBox) {
+  useEffect(
+    () => () => {
+      abortDriver(driverRef);
+      whisperRef.current?.dispose();
+      whisperRef.current = null;
+    },
+    [driverRef, whisperRef],
+  );
+}
+
+// ── Hook ────────────────────────────────────────────────────────────────
+
+/**
+ * React hook that turns speech into text using whichever engine the user's
+ * Voice Mode preferences and the browser's capabilities resolve to.
+ *
+ * - `start()` begins an attempt. It reports an "unsupported" error when no
+ *   engine is available and ignores calls while an attempt is in progress.
+ * - `stop()` ends the attempt; the transcript (if any) is delivered through
+ *   `opts.onTranscript`.
+ * - `cancel()` aborts the attempt and resets `state` and `error`.
+ *
+ * Two refs guard the window in which a capture engine is still waiting for
+ * microphone access, because `driverRef` is only populated once that wait is
+ * over:
+ * - `startingRef` blocks a second `start()` that runs before React has
+ *   re-rendered with the new `state`.
+ * - `attemptRef` is bumped by `cancel()` and on unmount; when `start()`
+ *   resumes and sees a different value it tears the fresh capture down
+ *   instead of leaving the microphone recording.
+ */
+export function useVoiceInput(opts: UseVoiceInputOptions): UseVoiceInputResult {
+  const caps = useMemo(() => detectVoiceCapabilities(), []);
+  const prefs = useVoiceModePrefs();
+  const enabled = opts.enabled !== false;
+  const engine = useMemo(
+    () => (enabled ? resolveActiveEngine(prefs.engine, caps, true) : null),
+    [enabled, prefs.engine, caps],
+  );
+  const supported = engine !== null;
+
+  const [state, setState] = useState<VoiceInputState>("idle");
+  const [error, setError] = useState<VoiceError | null>(null);
+  const [modelLoad, setModelLoad] = useState<VoiceModelLoadState>({
+    state: "idle",
+    progress: 0,
+  });
+
+  const driverRef = useRef<ActiveDriverRef>(null);
+  const whisperRef = useRef<WhisperWebClient | null>(null);
+  const attemptRef = useRef(0);
+  const startingRef = useRef(false);
+  const { onTranscriptRef, onErrorRef } = useCallbackRefs(opts);
+
+  const emitError = useCallback(
+    (e: VoiceError) => {
+      setError(e);
+      setState("idle");
+      onErrorRef.current?.(e);
+    },
+    [onErrorRef],
+  );
+
+  const resetModelLoad = useCallback(() => setModelLoad({ state: "idle", progress: 0 }), []);
+
+  useUnmountCleanup(driverRef, whisperRef);
+  useDisposeWhisperOnModelChange(whisperRef, prefs.whisperWebModel, resetModelLoad);
+
+  // Invalidate any start() still awaiting microphone access when the
+  // component goes away, so it releases the stream once it resumes.
+  useEffect(() => {
+    return () => {
+      attemptRef.current += 1;
+    };
+  }, []);
+
+  const start = useCallback(async () => {
+    if (!supported || !engine) {
+      emitError({ code: "unsupported", message: "Voice input is not supported in this browser." });
+      return;
+    }
+    if (state !== "idle" || startingRef.current) return;
+    // `state` comes from the last render and can lag behind the driver (a
+    // second call in the same tick, or a recognizer that errored but has not
+    // ended yet). Abort any leftover driver so it is neither orphaned nor
+    // able to clobber the attempt started below. No-op when nothing is active.
+    abortDriver(driverRef);
+    setError(null);
+    if (engine === "webSpeech") {
+      runWebSpeech({
+        setState,
+        driverRef,
+        emitError,
+        onTranscriptRef,
+        lang: resolveLang(prefs.language),
+      });
+      return;
+    }
+    const attempt = attemptRef.current;
+    startingRef.current = true;
+    try {
+      await beginCapture(engine, { setState, emitError, driverRef });
+    } finally {
+      startingRef.current = false;
+    }
+    if (attemptRef.current !== attempt) {
+      // cancel() or unmount happened while waiting for the microphone.
+      abortDriver(driverRef);
+      setState("idle");
+    }
+  }, [supported, engine, state, emitError, prefs.language, onTranscriptRef]);
+
+  const stop = useCallback(async () => {
+    const driver = driverRef.current;
+    if (!driver) return;
+    if (driver.kind === "webSpeech") {
+      // The recognizer's onend handler delivers the transcript and resets state.
+      driver.recognition.stop();
+      return;
+    }
+    await finishCapture({
+      driverRef,
+      whisperRef,
+      setState,
+      setModelLoad,
+      emitError,
+      onTranscriptRef,
+      whisperModel: prefs.whisperWebModel,
+      language: prefs.language,
+    });
+  }, [emitError, prefs.whisperWebModel, prefs.language, onTranscriptRef]);
+
+  const cancel = useCallback(() => {
+    attemptRef.current += 1;
+    abortDriver(driverRef);
+    setState("idle");
+    setError(null);
+  }, []);
+
+  return { supported, engine, state, error, modelLoad, start, stop, cancel };
+}
diff --git a/apps/web/lib/api/domains/settings-api.ts b/apps/web/lib/api/domains/settings-api.ts
index 343e30efa..ec9b229be 100644
--- a/apps/web/lib/api/domains/settings-api.ts
+++ b/apps/web/lib/api/domains/settings-api.ts
@@ -21,6 +21,7 @@ import type {
   UserSettingsResponse,
   DynamicModelsResponse,
 } from "@/lib/types/http";
+import type { VoiceModeSettings } from "@/lib/types/http-voice";
 
 // User settings
 export async function fetchUserSettings(options?: ApiRequestOptions) {
@@ -52,6 +53,7 @@ export async function updateUserSettings(
     terminal_font_family?: string;
     terminal_font_size?: number;
     changes_panel_layout?: "flat" | "tree";
+    voice_mode?: VoiceModeSettings;
   },
   options?: ApiRequestOptions,
 ) {
diff --git a/apps/web/lib/api/domains/voice-api.test.ts b/apps/web/lib/api/domains/voice-api.test.ts
new file mode 100644
index 000000000..d3618cae8
--- /dev/null
+++ b/apps/web/lib/api/domains/voice-api.test.ts
@@ -0,0 +1,63 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { ApiError } from "../client";
+import { transcribeAudio } from "./voice-api";
+
+const originalFetch = global.fetch;
+
+// Exercises transcribeAudio against a stubbed global fetch: request shape on
+// success, and ApiError propagation for non-2xx responses.
+describe("transcribeAudio", () => {
+  afterEach(() => {
+    // Each test replaces global.fetch; put the real one back afterwards.
+    global.fetch = originalFetch;
+  });
+
+  beforeEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it("posts multipart/form-data with the audio under the 'audio' field", async () => {
+    let capturedRequest: { method?: string; bodyText: string } = { bodyText: "" };
+    global.fetch = vi.fn(async (_url: RequestInfo | URL, init?: RequestInit) => {
+      // Record only what the assertions need: the method and whether the
+      // body was a FormData instance.
+      capturedRequest = {
+        method: init?.method,
+        bodyText: init?.body instanceof FormData ? "<formdata>" : String(init?.body),
+      };
+      return new Response(JSON.stringify({ text: "hi" }), {
+        status: 200,
+        headers: { "Content-Type": "application/json" },
+      });
+    }) as unknown as typeof fetch;
+
+    const blob = new Blob([new Uint8Array([1, 2, 3])], { type: "audio/webm" });
+    const result = await transcribeAudio(blob, "clip.webm", {
+      baseUrl: "http://example.test",
+    });
+
+    expect(result.text).toBe("hi");
+    expect(capturedRequest.method).toBe("POST");
+    expect(capturedRequest.bodyText).toBe("<formdata>");
+  });
+
+  it("throws ApiError(503) when the server reports not-configured", async () => {
+    global.fetch = vi.fn(
+      async () =>
+        new Response(JSON.stringify({ error: "voice transcription is not configured" }), {
+          status: 503,
+        }),
+    ) as unknown as typeof fetch;
+
+    const blob = new Blob([new Uint8Array([1])], { type: "audio/webm" });
+    await expect(transcribeAudio(blob, "x.webm", { baseUrl: "http://x" })).rejects.toMatchObject({
+      status: 503,
+    });
+  });
+
+  it("surfaces non-2xx errors as ApiError instances", async () => {
+    // The body is not JSON here, so the error must be built without it.
+    global.fetch = vi.fn(
+      async () => new Response("bad", { status: 502, statusText: "Bad Gateway" }),
+    ) as unknown as typeof fetch;
+
+    const blob = new Blob([new Uint8Array([1])], { type: "audio/webm" });
+    await expect(transcribeAudio(blob, "x.webm", { baseUrl: "http://x" })).rejects.toBeInstanceOf(
+      ApiError,
+    );
+  });
+});
diff --git a/apps/web/lib/api/domains/voice-api.ts b/apps/web/lib/api/domains/voice-api.ts
new file mode 100644
index 000000000..d3af1a571
--- /dev/null
+++ b/apps/web/lib/api/domains/voice-api.ts
@@ -0,0 +1,51 @@
+import { ApiError, type ApiRequestOptions } from "../client";
+import { getBackendConfig } from "@/lib/config";
+
+export type TranscribeResponse = {
+  text: string;
+};
+
+/**
+ * POST audio to the backend Whisper fallback. Returns the transcribed text.
+ *
+ * @param blob     Recorded audio to upload.
+ * @param filename File name attached to the multipart "audio" field.
+ * @param options  Optional base URL override and extra fetch init.
+ *
+ * Throws ApiError on non-2xx. Two statuses are meaningful to the caller:
+ *   - 503: server has no API key configured — the hook should treat the
+ *     Whisper fallback as unavailable and surface a clean message.
+ *   - any other non-2xx: transient error — show a generic toast.
+ *
+ * Also throws ApiError (carrying the response's status) when a 2xx response
+ * is not JSON or has no string `text` field, so callers never receive a
+ * value that breaks on `result.text`.
+ */
+export async function transcribeAudio(
+  blob: Blob,
+  filename: string,
+  options?: ApiRequestOptions,
+): Promise<TranscribeResponse> {
+  const baseUrl = options?.baseUrl ?? getBackendConfig().apiBaseUrl;
+  const formData = new FormData();
+  formData.append("audio", blob, filename);
+
+  // Do NOT set Content-Type: the browser sets multipart/form-data with the
+  // correct boundary automatically when given a FormData body. Spread caller
+  // init *first* so method/body always win — otherwise a caller passing
+  // `init: { method: "GET" }` (or a stale body) would silently break the upload.
+  const response = await fetch(`${baseUrl}/api/v1/transcribe`, {
+    ...options?.init,
+    method: "POST",
+    body: formData,
+  });
+
+  // Parse once for both paths; a non-JSON body is represented as null.
+  let body: unknown = null;
+  try {
+    body = await response.json();
+  } catch {
+    // body remains null
+  }
+
+  if (!response.ok) {
+    let message = `Transcription failed: ${response.status} ${response.statusText}`;
+    if (body && typeof body === "object" && "error" in body) {
+      const errVal = (body as { error?: unknown }).error;
+      if (typeof errVal === "string") message = errVal;
+    }
+    throw new ApiError(message, response.status, body);
+  }
+
+  const text = body && typeof body === "object" ? (body as { text?: unknown }).text : undefined;
+  if (typeof text !== "string") {
+    throw new ApiError("Transcription failed: malformed response", response.status, body);
+  }
+  return body as TranscribeResponse;
+}
diff --git a/apps/web/lib/keyboard/constants.ts b/apps/web/lib/keyboard/constants.ts
index 31271c0b9..e05ab2373 100644
--- a/apps/web/lib/keyboard/constants.ts
+++ b/apps/web/lib/keyboard/constants.ts
@@ -153,4 +153,10 @@ export const SHORTCUTS = {
     key: KEYS.F,
     modifiers: { ctrlOrCmd: true },
   },
+  // Cmd/Ctrl+Shift+M starts/stops voice input on the chat composer. The default
+  // is configurable per-user via the Voice Mode settings page.
+  VOICE_INPUT_TOGGLE: {
+    key: KEYS.M,
+    modifiers: { ctrlOrCmd: true, shift: true },
+  },
 } as const;
diff --git a/apps/web/lib/keyboard/shortcut-overrides.test.ts b/apps/web/lib/keyboard/shortcut-overrides.test.ts
index 6453bc902..43c59c3df 100644
--- a/apps/web/lib/keyboard/shortcut-overrides.test.ts
+++ b/apps/web/lib/keyboard/shortcut-overrides.test.ts
@@ -20,7 +20,8 @@ describe("CONFIGURABLE_SHORTCUTS", () => {
     expect(ids).toContain("FOCUS_INPUT");
     expect(ids).toContain("TOGGLE_PLAN_MODE");
     expect(ids).toContain("TASK_SWITCHER");
-    expect(ids).toHaveLength(10);
+    expect(ids).toContain("VOICE_INPUT_TOGGLE");
+    expect(ids).toHaveLength(11);
   });
 
   it("each entry has a label and default matching SHORTCUTS", () => {
diff --git a/apps/web/lib/keyboard/shortcut-overrides.ts b/apps/web/lib/keyboard/shortcut-overrides.ts
index 8ac1b7a37..a31d61e15 100644
--- a/apps/web/lib/keyboard/shortcut-overrides.ts
+++ b/apps/web/lib/keyboard/shortcut-overrides.ts
@@ -10,7 +10,8 @@ export type ConfigurableShortcutId =
   | "NEW_TASK"
   | "FOCUS_INPUT"
   | "TOGGLE_PLAN_MODE"
-  | "TASK_SWITCHER";
+  | "TASK_SWITCHER"
+  | "VOICE_INPUT_TOGGLE";
 
 export type StoredShortcutOverrides = Record<
   string,
@@ -31,6 +32,7 @@ export const CONFIGURABLE_SHORTCUTS: Record<
   FOCUS_INPUT: { label: "Focus Chat Input", default: SHORTCUTS.FOCUS_INPUT },
   TOGGLE_PLAN_MODE: { label: "Toggle Plan Mode", default: SHORTCUTS.TOGGLE_PLAN_MODE },
   TASK_SWITCHER: { label: "Recent Task Switcher", default: SHORTCUTS.TASK_SWITCHER },
+  VOICE_INPUT_TOGGLE: { label: "Voice Input", default: SHORTCUTS.VOICE_INPUT_TOGGLE },
 };
 
 export function getShortcut(
diff --git a/apps/web/lib/ssr/user-settings.test.ts b/apps/web/lib/ssr/user-settings.test.ts
index 04f425b0d..38b681b6e 100644
--- a/apps/web/lib/ssr/user-settings.test.ts
+++ b/apps/web/lib/ssr/user-settings.test.ts
@@ -1,5 +1,10 @@
 import { describe, it, expect } from "vitest";
-import { buildCoreFields, mapUserSettingsResponse, parseChangesPanelLayout } from "./user-settings";
+import {
+  buildCoreFields,
+  mapUserSettingsResponse,
+  parseChangesPanelLayout,
+  parseVoiceMode,
+} from "./user-settings";
 
 describe("buildCoreFields", () => {
   it("maps terminal_font_family to terminalFontFamily", () => {
@@ -103,3 +108,78 @@ describe("parseChangesPanelLayout", () => {
     expect(parseChangesPanelLayout("")).toBe("flat");
   });
 });
+
+describe("parseVoiceMode", () => {
+  it("maps every field from the snake_case wire payload", () => {
+    expect(
+      parseVoiceMode({
+        enabled: false,
+        engine: "whisperWeb",
+        language: "pt-PT",
+        mode: "hold",
+        auto_send: true,
+        whisper_web_model: "small",
+      }),
+    ).toEqual({
+      enabled: false,
+      engine: "whisperWeb",
+      language: "pt-PT",
+      mode: "hold",
+      autoSend: true,
+      whisperWebModel: "small",
+    });
+  });
+
+  it("returns the defaults when the payload is undefined", () => {
+    expect(parseVoiceMode(undefined)).toEqual({
+      enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    });
+  });
+
+  it("defaults enabled to true when the wire payload omits the field (old rows)", () => {
+    const result = parseVoiceMode({
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      auto_send: false,
+      whisper_web_model: "base",
+    } as unknown as Parameters<typeof parseVoiceMode>[0]);
+    expect(result.enabled).toBe(true);
+  });
+
+  it("fills in defaults for missing string fields and coerces auto_send to false", () => {
+    const result = parseVoiceMode({
+      engine: "" as unknown as "auto",
+      language: "",
+      mode: "" as unknown as "toggle",
+      whisper_web_model: "" as unknown as "base",
+    } as unknown as Parameters<typeof parseVoiceMode>[0]);
+    expect(result).toEqual({
+      enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    });
+  });
+});
+
+describe("mapUserSettingsResponse voice mode", () => {
+  it("defaults the whole voiceMode object when response is null", () => {
+    const result = mapUserSettingsResponse(null);
+    expect(result.voiceMode).toEqual({
+      enabled: true,
+      engine: "auto",
+      language: "auto",
+      mode: "toggle",
+      autoSend: false,
+      whisperWebModel: "base",
+    });
+  });
+});
diff --git a/apps/web/lib/ssr/user-settings.ts b/apps/web/lib/ssr/user-settings.ts
index 74a3d127f..b2ed73508 100644
--- a/apps/web/lib/ssr/user-settings.ts
+++ b/apps/web/lib/ssr/user-settings.ts
@@ -1,6 +1,8 @@
 import { fromApiSidebarView } from "@/lib/state/slices/ui/sidebar-view-wire";
 import type { SidebarView } from "@/lib/state/slices/ui/sidebar-view-types";
+import { DEFAULT_VOICE_MODE_STATE, type VoiceModeState } from "@/lib/state/slices/settings/types";
 import type { SavedLayout, UserSettingsResponse } from "@/lib/types/http";
+import type { VoiceModeSettings } from "@/lib/types/http-voice";
 
 export type UserSettingsData = NonNullable<UserSettingsResponse["settings"]>;
 
@@ -12,6 +14,25 @@ export function parseChangesPanelLayout(value: string | undefined): "flat" | "tr
   return value === "tree" ? "tree" : "flat";
 }
 
+/** Returns `value` when it is one of `allowed`, otherwise `fallback`. */
+function pickKnown<T extends string>(value: unknown, allowed: readonly T[], fallback: T): T {
+  return allowed.includes(value as T) ? (value as T) : fallback;
+}
+
+const VOICE_ENGINES: readonly VoiceModeState["engine"][] = [
+  "auto",
+  "webSpeech",
+  "whisperWeb",
+  "whisperServer",
+];
+const VOICE_ACTIVATION_MODES: readonly VoiceModeState["mode"][] = ["toggle", "hold"];
+const WHISPER_WEB_MODELS: readonly VoiceModeState["whisperWebModel"][] = ["tiny", "base", "small"];
+
+/**
+ * Maps the backend's snake_case VoiceMode payload into the camelCase shape
+ * the store and UI use. Missing or partial payloads fall back to the defaults
+ * so an old user row (written before VoiceMode existed) doesn't surface as
+ * an empty string the radio groups can't render. Enumerated fields (`engine`,
+ * `mode`, `whisper_web_model`) are additionally checked against their known
+ * values, so an unrecognised string falls back to the default as well.
+ * `enabled` defaults to true for users who haven't toggled it — voice mode
+ * is opt-out, not opt-in. `auto_send` is only true when the payload says so.
+ */
+export function parseVoiceMode(value: VoiceModeSettings | undefined): VoiceModeState {
+  if (!value) return { ...DEFAULT_VOICE_MODE_STATE };
+  const defaults = DEFAULT_VOICE_MODE_STATE;
+  const language =
+    typeof value.language === "string" && value.language ? value.language : defaults.language;
+  return {
+    enabled: value.enabled !== false,
+    engine: pickKnown(value.engine, VOICE_ENGINES, defaults.engine),
+    language,
+    mode: pickKnown(value.mode, VOICE_ACTIVATION_MODES, defaults.mode),
+    autoSend: value.auto_send === true,
+    whisperWebModel: pickKnown(value.whisper_web_model, WHISPER_WEB_MODELS, defaults.whisperWebModel),
+  };
+}
+
 function buildTerminalFields(s: UserSettingsData) {
   return {
     terminalLinkBehavior: parseTerminalLinkBehavior(s.terminal_link_behavior),
@@ -21,6 +42,10 @@ function buildTerminalFields(s: UserSettingsData) {
   };
 }
 
+/** Builds the `voiceMode` slice of the mapped settings from the wire payload. */
+function buildVoiceModeFields(s: UserSettingsData) {
+  const voiceMode = parseVoiceMode(s.voice_mode);
+  return { voiceMode };
+}
+
 function buildIdentityFields(s: UserSettingsData) {
   return {
     workspaceId: s.workspace_id || null,
@@ -51,6 +76,7 @@ export function buildCoreFields(s: UserSettingsData) {
     savedLayouts: s.saved_layouts ?? [],
     sidebarViews: (s.sidebar_views ?? []).map(fromApiSidebarView) as SidebarView[],
     ...buildTerminalFields(s),
+    ...buildVoiceModeFields(s),
   };
 }
 
@@ -91,6 +117,7 @@ export function mapUserSettingsResponse(response: UserSettingsResponse | null) {
       terminalFontFamily: null,
       terminalFontSize: null,
       changesPanelLayout: "flat" as const,
+      voiceMode: { ...DEFAULT_VOICE_MODE_STATE },
       ...buildLspFields(undefined),
       loaded: false,
     };
diff --git a/apps/web/lib/state/slices/settings/settings-slice.ts b/apps/web/lib/state/slices/settings/settings-slice.ts
index 26ce9c67b..d9dca4acb 100644
--- a/apps/web/lib/state/slices/settings/settings-slice.ts
+++ b/apps/web/lib/state/slices/settings/settings-slice.ts
@@ -1,5 +1,5 @@
 import type { StateCreator } from "zustand";
-import type { SettingsSlice, SettingsSliceState } from "./types";
+import { DEFAULT_VOICE_MODE_STATE, type SettingsSlice, type SettingsSliceState } from "./types";
 
 export const defaultSettingsState: SettingsSliceState = {
   executors: { items: [] },
@@ -44,6 +44,7 @@ export const defaultSettingsState: SettingsSliceState = {
     terminalFontFamily: null,
     terminalFontSize: null,
     changesPanelLayout: "flat",
+    voiceMode: { ...DEFAULT_VOICE_MODE_STATE },
     loaded: false,
   },
 };
diff --git a/apps/web/lib/state/slices/settings/types.ts b/apps/web/lib/state/slices/settings/types.ts
index 73f094761..ca7740a93 100644
--- a/apps/web/lib/state/slices/settings/types.ts
+++ b/apps/web/lib/state/slices/settings/types.ts
@@ -11,6 +11,11 @@ import type {
   SavedLayout,
   ToolStatus,
 } from "@/lib/types/http";
+import type {
+  VoiceInputActivationMode,
+  VoiceInputEngine,
+  WhisperWebModelSize,
+} from "@/lib/types/http-voice";
 import type { SidebarView } from "@/lib/state/slices/ui/sidebar-view-types";
 import type { SecretListItem } from "@/lib/types/http-secrets";
 import type { SpritesStatus, SpritesInstance } from "@/lib/types/http-sprites";
@@ -156,9 +161,29 @@ export type UserSettingsState = {
   terminalFontFamily: string | null;
   terminalFontSize: number | null;
   changesPanelLayout: "flat" | "tree";
+  voiceMode: VoiceModeState;
   loaded: boolean;
 };
 
+export type VoiceModeState = {
+  enabled: boolean;
+  engine: VoiceInputEngine;
+  language: string;
+  mode: VoiceInputActivationMode;
+  autoSend: boolean;
+  whisperWebModel: WhisperWebModelSize;
+};
+
+/** Default values used by the slice init and by SSR hydration fallback. */
+export const DEFAULT_VOICE_MODE_STATE: VoiceModeState = {
+  enabled: true,
+  engine: "auto",
+  language: "auto",
+  mode: "toggle",
+  autoSend: false,
+  whisperWebModel: "base",
+};
+
 export type SettingsSliceState = {
   executors: ExecutorsState;
   settingsAgents: SettingsAgentsState;
diff --git a/apps/web/lib/types/backend.ts b/apps/web/lib/types/backend.ts
index c97912e3c..448dba0f9 100644
--- a/apps/web/lib/types/backend.ts
+++ b/apps/web/lib/types/backend.ts
@@ -383,6 +383,7 @@ export type UserSettingsUpdatedPayload = {
   keyboard_shortcuts?: Record<string, { key: string; modifiers?: Record<string, boolean> }>;
   terminal_link_behavior?: string;
   changes_panel_layout?: "flat" | "tree";
+  voice_mode?: import("@/lib/types/http-voice").VoiceModeSettings;
   updated_at?: string;
 };
 
diff --git a/apps/web/lib/types/http-voice.ts b/apps/web/lib/types/http-voice.ts
new file mode 100644
index 000000000..c43351524
--- /dev/null
+++ b/apps/web/lib/types/http-voice.ts
@@ -0,0 +1,17 @@
+/**
+ * Wire types for the Voice Mode user settings. Kept in their own module so
+ * http.ts stays under the 600-line file limit.
+ */
+
+export type VoiceInputEngine = "auto" | "webSpeech" | "whisperWeb" | "whisperServer";
+export type VoiceInputActivationMode = "toggle" | "hold";
+export type WhisperWebModelSize = "tiny" | "base" | "small";
+
+export type VoiceModeSettings = {
+  enabled: boolean;
+  engine: VoiceInputEngine;
+  language: string;
+  mode: VoiceInputActivationMode;
+  auto_send: boolean;
+  whisper_web_model: WhisperWebModelSize;
+};
diff --git a/apps/web/lib/types/http.ts b/apps/web/lib/types/http.ts
index fae94bf0c..0953be4c3 100644
--- a/apps/web/lib/types/http.ts
+++ b/apps/web/lib/types/http.ts
@@ -406,6 +406,8 @@ export type SidebarViewApi = {
   collapsed_groups: string[];
 };
 
+import type { VoiceModeSettings } from "./http-voice";
+
 export type UserSettings = {
   user_id: string;
   workspace_id: WorkspaceId;
@@ -432,6 +434,7 @@ export type UserSettings = {
   terminal_font_family?: string;
   terminal_font_size?: number;
   changes_panel_layout?: "flat" | "tree";
+  voice_mode?: VoiceModeSettings;
   updated_at: string;
 };
 
diff --git a/apps/web/lib/voice/capabilities.test.ts b/apps/web/lib/voice/capabilities.test.ts
new file mode 100644
index 000000000..d8b8d7191
--- /dev/null
+++ b/apps/web/lib/voice/capabilities.test.ts
@@ -0,0 +1,97 @@
+import { describe, it, expect, afterEach, vi } from "vitest";
+import { detectVoiceCapabilities, resolveActiveEngine } from "./capabilities";
+
+describe("detectVoiceCapabilities", () => {
+  afterEach(() => {
+    vi.unstubAllGlobals();
+    delete (window as unknown as { SpeechRecognition?: unknown }).SpeechRecognition;
+    delete (window as unknown as { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition;
+    delete (window as unknown as { MediaRecorder?: unknown }).MediaRecorder;
+  });
+
+  it("reports webSpeech true when window.SpeechRecognition exists", () => {
+    (window as unknown as { SpeechRecognition: () => void }).SpeechRecognition = () => {};
+    expect(detectVoiceCapabilities().webSpeech).toBe(true);
+  });
+
+  it("reports webSpeech true on the prefixed webkit variant too", () => {
+    (window as unknown as { webkitSpeechRecognition: () => void }).webkitSpeechRecognition =
+      () => {};
+    expect(detectVoiceCapabilities().webSpeech).toBe(true);
+  });
+
+  it("reports audioCapture true when MediaRecorder + getUserMedia are present", () => {
+    (window as unknown as { MediaRecorder: object }).MediaRecorder = {
+      isTypeSupported: () => true,
+    };
+    vi.stubGlobal("navigator", { mediaDevices: { getUserMedia: () => Promise.resolve({}) } });
+    expect(detectVoiceCapabilities().audioCapture).toBe(true);
+  });
+
+  it("reports everything false when no APIs are available", () => {
+    vi.stubGlobal("navigator", {});
+    expect(detectVoiceCapabilities()).toEqual({
+      webSpeech: false,
+      whisperWeb: false,
+      audioCapture: false,
+    });
+  });
+});
+
+describe("resolveActiveEngine", () => {
+  const allAvailable = { webSpeech: true, whisperWeb: true, audioCapture: true };
+
+  it("auto picks webSpeech first when available", () => {
+    expect(resolveActiveEngine("auto", allAvailable, true)).toBe("webSpeech");
+  });
+
+  it("auto falls back to whisperWeb when webSpeech is missing", () => {
+    expect(
+      resolveActiveEngine("auto", { webSpeech: false, whisperWeb: true, audioCapture: true }, true),
+    ).toBe("whisperWeb");
+  });
+
+  it("auto falls back to whisperServer when no in-browser engine is available", () => {
+    expect(
+      resolveActiveEngine(
+        "auto",
+        { webSpeech: false, whisperWeb: false, audioCapture: true },
+        true,
+      ),
+    ).toBe("whisperServer");
+  });
+
+  it("returns null when nothing is usable", () => {
+    expect(
+      resolveActiveEngine(
+        "auto",
+        { webSpeech: false, whisperWeb: false, audioCapture: false },
+        true,
+      ),
+    ).toBeNull();
+  });
+
+  it("honors a pinned engine when usable", () => {
+    expect(resolveActiveEngine("whisperWeb", allAvailable, true)).toBe("whisperWeb");
+  });
+
+  it("falls back along the auto order when the pinned engine is missing", () => {
+    expect(
+      resolveActiveEngine(
+        "whisperWeb",
+        { webSpeech: true, whisperWeb: false, audioCapture: true },
+        true,
+      ),
+    ).toBe("webSpeech");
+  });
+
+  it("treats whisperServer as unusable when serverFallbackEnabled is false", () => {
+    expect(
+      resolveActiveEngine(
+        "whisperServer",
+        { webSpeech: false, whisperWeb: false, audioCapture: true },
+        false,
+      ),
+    ).toBeNull();
+  });
+});
diff --git a/apps/web/lib/voice/capabilities.ts b/apps/web/lib/voice/capabilities.ts
new file mode 100644
index 000000000..6fd36f161
--- /dev/null
+++ b/apps/web/lib/voice/capabilities.ts
@@ -0,0 +1,75 @@
+"use client";
+
+import type { VoiceInputEngine } from "@/lib/types/http-voice";
+
+/**
+ * Capability report for the voice-mode engines available in the current
+ * browser. Shared between `useVoiceInput` (which picks the active engine)
+ * and the Voice Mode settings page (which decides which options to render).
+ */
+export type VoiceCapabilities = {
+  webSpeech: boolean;
+  whisperWeb: boolean;
+  /** True if the browser supports MediaRecorder + getUserMedia, the floor
+   *  for any audio-capture engine (whisperWeb + whisperServer). */
+  audioCapture: boolean;
+};
+
+/**
+ * Detects which voice engines this browser can run. Safe to call during
+ * SSR — returns all-false instead of throwing on missing globals.
+ */
+export function detectVoiceCapabilities(): VoiceCapabilities {
+  // No window means we are rendering on the server: report nothing available.
+  if (typeof window === "undefined") {
+    return { webSpeech: false, whisperWeb: false, audioCapture: false };
+  }
+  // Either the standard constructor or the webkit-prefixed one is enough.
+  const speechCtor =
+    (window as Window & { SpeechRecognition?: unknown }).SpeechRecognition ||
+    (window as Window & { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition;
+  const hasNavigator = typeof navigator !== "undefined";
+  const hasGetUserMedia =
+    hasNavigator && typeof navigator.mediaDevices?.getUserMedia === "function";
+  const hasRecorder = typeof window.MediaRecorder !== "undefined";
+  const audioCapture = hasGetUserMedia && hasRecorder;
+  // Whisper Web additionally runs its model inside a Worker, so it needs both
+  // audio capture and Worker support.
+  const whisperWeb = audioCapture && typeof Worker !== "undefined";
+  return { webSpeech: !!speechCtor, whisperWeb, audioCapture };
+}
+
+/**
+ * Resolves the active voice-input engine given a user preference and the
+ * detected capabilities. Returns null when nothing usable is available.
+ *
+ * Auto-fallback order: Web Speech (cheapest, native) → Whisper Web (private,
+ * heavier) → Whisper Server (needs audio capture and a configured server).
+ * If the user pinned a specific engine that isn't available, we degrade
+ * gracefully along the same order.
+ */
+export function resolveActiveEngine(
+  preference: VoiceInputEngine,
+  caps: VoiceCapabilities,
+  serverFallbackEnabled: boolean,
+): Exclude<VoiceInputEngine, "auto"> | null {
+  type ConcreteEngine = Exclude<VoiceInputEngine, "auto">;
+  // whisperServer needs local audio capture plus an enabled server fallback.
+  const canRun = (engine: ConcreteEngine): boolean => {
+    switch (engine) {
+      case "webSpeech":
+        return caps.webSpeech;
+      case "whisperWeb":
+        return caps.whisperWeb;
+      default:
+        return caps.audioCapture && serverFallbackEnabled;
+    }
+  };
+
+  // A pinned engine wins outright whenever it can actually run.
+  if (preference !== "auto" && canRun(preference)) return preference;
+  // "auto", or a pinned engine that is unusable: take the first usable engine
+  // in fallback order so the button still works instead of silently no-op.
+  const fallbackOrder: ConcreteEngine[] = ["webSpeech", "whisperWeb", "whisperServer"];
+  return fallbackOrder.find(canRun) ?? null;
+}
diff --git a/apps/web/lib/voice/whisper-web-client.ts b/apps/web/lib/voice/whisper-web-client.ts
new file mode 100644
index 000000000..e9d1cc620
--- /dev/null
+++ b/apps/web/lib/voice/whisper-web-client.ts
@@ -0,0 +1,199 @@
+"use client";
+
+import { whisperModelConfig } from "./whisper-web-models";
+import type { WhisperWebModelSize } from "@/lib/types/http-voice";
+
+/**
+ * Sample rate Whisper expects. We resample the captured audio to this rate
+ * (mono Float32Array) before sending to the worker — Whisper's own decoder
+ * would do this too, but doing it here keeps the worker focused on inference.
+ */
+const WHISPER_SAMPLE_RATE = 16000;
+
+export type WhisperWebProgress = {
+  stage: string;
+  progress: number;
+};
+
+export type WhisperWebHandlers = {
+  onProgress?: (p: WhisperWebProgress) => void;
+};
+
+type WorkerMessage =
+  | { type: "progress"; stage: string; progress: number }
+  | { type: "ready" }
+  | { type: "result"; text: string }
+  | { type: "error"; message: string };
+
+type Pending = {
+  kind: "init" | "transcribe";
+  resolve: (value: string | undefined) => void;
+  reject: (err: Error) => void;
+};
+
+/**
+ * Client wrapper around the whisper-web worker. Hides the postMessage
+ * protocol behind a clean promise-based API and handles the audio decode +
+ * resample step so callers only see "Blob in, transcript out".
+ */
+export class WhisperWebClient {
+  private worker: Worker | null = null;
+  private pending: Pending | null = null;
+  private ready = false;
+  private loadingModelId: string | null = null;
+
+  constructor(private handlers: WhisperWebHandlers = {}) {}
+
+  /**
+   * Lazy-creates the worker on first use. Returns a promise that resolves
+   * when the requested model is loaded and ready to transcribe.
+   */
+  async init(size: WhisperWebModelSize): Promise<void> {
+    const config = whisperModelConfig(size);
+    if (this.ready && this.loadingModelId === config.modelId) return;
+    // Refuse before touching state: a rejected init must not un-ready a loaded model.
+    if (this.pending) throw new Error("WhisperWebClient: another request is in flight");
+    this.ensureWorker();
+    this.loadingModelId = config.modelId;
+    this.ready = false;
+    await this.send({ kind: "init", payload: { type: "init", model: config.modelId } });
+    this.ready = true;
+  }
+
+  /**
+   * Transcribe a recorded blob. The blob may be in any container the browser
+   * can decode (audio/webm, audio/wav, audio/mp4, …) — we resample everything
+   * to 16 kHz mono Float32 before handing to the worker.
+   */
+  async transcribe(blob: Blob, language?: string): Promise<string> {
+    if (!this.ready || !this.worker) {
+      throw new Error("WhisperWebClient: not initialized");
+    }
+    const audio = await blobToWhisperFloat32(blob);
+    const text = await this.send({
+      kind: "transcribe",
+      payload: { type: "transcribe", audio, language },
+      transfer: [audio.buffer],
+    });
+    return text ?? "";
+  }
+
+  /** Tear down the worker and release the loaded model. */
+  dispose(): void {
+    if (this.worker) {
+      try {
+        this.worker.postMessage({ type: "dispose" });
+      } catch {
+        // ignore
+      }
+      this.worker.terminate();
+      this.worker = null;
+    }
+    this.ready = false;
+    this.loadingModelId = null;
+    if (this.pending) {
+      this.pending.reject(new Error("WhisperWebClient disposed"));
+      this.pending = null;
+    }
+  }
+
+  private ensureWorker() {
+    if (this.worker) return;
+    // The `new Worker(new URL(..., import.meta.url))` form is Next.js / webpack's
+    // recommended pattern — webpack handles the bundling and asset path.
+    this.worker = new Worker(new URL("../../workers/whisper-web.worker.ts", import.meta.url), {
+      type: "module",
+    });
+    // Capture the worker at listener-attach time: events from a worker that was
+    // already disposed or replaced must never act on the active worker's state.
+    const ownWorker = this.worker;
+    ownWorker.addEventListener("message", (e: MessageEvent<WorkerMessage>) => {
+      if (this.worker === ownWorker) this.handleMessage(e.data);
+    });
+    ownWorker.addEventListener("error", (e) => {
+      ownWorker.terminate();
+      // A stale worker's crash is not the active request's failure.
+      if (this.worker !== ownWorker) return;
+      this.worker = null;
+      this.ready = false;
+      this.loadingModelId = null;
+      const pending = this.pending;
+      this.pending = null;
+      pending?.reject(new Error(e.message || "Whisper worker crashed"));
+    });
+  }
+
+  /** Posts one request to the worker; only a single request may be in flight. */
+  private send(args: {
+    kind: "init" | "transcribe";
+    payload: object;
+    transfer?: Transferable[];
+  }): Promise<string | undefined> {
+    const worker = this.worker;
+    // Precondition failures are rejections, never synchronous throws.
+    if (!worker) return Promise.reject(new Error("WhisperWebClient: worker not initialized"));
+    if (this.pending) {
+      return Promise.reject(new Error("WhisperWebClient: another request is in flight"));
+    }
+    return new Promise<string | undefined>((resolve, reject) => {
+      const request: Pending = { kind: args.kind, resolve, reject };
+      this.pending = request;
+      try {
+        worker.postMessage(args.payload, args.transfer ?? []);
+      } catch (err) {
+        // postMessage can throw (e.g. DataCloneError). Free the slot, otherwise
+        // every later call fails with "another request is in flight".
+        if (this.pending === request) this.pending = null;
+        reject(err instanceof Error ? err : new Error(String(err)));
+      }
+    });
+  }
+
+  /** Forwards progress to the handler; any other message settles the pending request. */
+  private handleMessage(msg: WorkerMessage) {
+    if (msg.type === "progress") {
+      this.handlers.onProgress?.({ stage: msg.stage, progress: msg.progress });
+      return;
+    }
+    const pending = this.pending;
+    if (!pending) return;
+    this.pending = null;
+    if (msg.type === "error") pending.reject(new Error(msg.message));
+    else if (msg.type === "ready") pending.resolve(undefined);
+    else if (msg.type === "result") pending.resolve(msg.text);
+  }
+}
+
+/**
+ * Decode an arbitrary audio Blob and return a Float32Array sampled at 16 kHz
+ * mono — the format Whisper expects.
+ */
+export async function blobToWhisperFloat32(blob: Blob): Promise<Float32Array> {
+  const encoded = await blob.arrayBuffer();
+  // Prefer the standard constructor and fall back to the webkit-prefixed one.
+  type PrefixedWindow = { webkitAudioContext?: typeof AudioContext };
+  const ContextCtor =
+    window.AudioContext ?? (window as unknown as PrefixedWindow).webkitAudioContext;
+  if (!ContextCtor) throw new Error("AudioContext is not available in this browser");
+  // This context is used only for decoding; resampling happens afterwards.
+  const decoder = new ContextCtor();
+  let pcm: AudioBuffer;
+  try {
+    pcm = await decoder.decodeAudioData(encoded);
+  } finally {
+    // Always release the context, whether or not decoding succeeded.
+    await decoder.close();
+  }
+  return resampleToMono16k(pcm);
+}
+
+async function resampleToMono16k(buf: AudioBuffer): Promise<Float32Array> {
+  // Frame count of the clip once it is rendered at Whisper's sample rate.
+  const frameCount = Math.ceil(buf.duration * WHISPER_SAMPLE_RATE);
+  const renderCtx = new OfflineAudioContext(1, frameCount, WHISPER_SAMPLE_RATE);
+  const player = renderCtx.createBufferSource();
+  player.buffer = buf;
+  player.connect(renderCtx.destination);
+  player.start(0);
+  return (await renderCtx.startRendering()).getChannelData(0).slice();
+}
diff --git a/apps/web/lib/voice/whisper-web-models.ts b/apps/web/lib/voice/whisper-web-models.ts
new file mode 100644
index 000000000..eaffe6698
--- /dev/null
+++ b/apps/web/lib/voice/whisper-web-models.ts
@@ -0,0 +1,42 @@
+import type { WhisperWebModelSize } from "@/lib/types/http-voice";
+
+export type WhisperModelConfig = {
+  size: WhisperWebModelSize;
+  /** Hugging Face model id. Use the `onnx-community/*` mirrors — `Xenova/*`
+   *  defaults to 4-bit MatMulNBits weights that crash on WASM (see note below). */
+  modelId: string;
+  /** Rough on-disk size after download, shown in the settings UI. */
+  approxBytes: number;
+  /** Human-readable label. */
+  label: string;
+};
+
+// The `onnx-community/whisper-*` mirrors are the maintained transformers.js
+// exports. The older `Xenova/whisper-*` mirrors default to 4-bit (`MatMulNBits`)
+// weights that only run on WebGPU — on WASM they fail with
+// `Missing required scale: ... weight_merged_0_scale`. The worker loads these
+// mirrors with explicit non-quantized dtypes (fp32 encoder, fp16 decoder).
+export const WHISPER_WEB_MODELS: Record<WhisperWebModelSize, WhisperModelConfig> = {
+  tiny: {
+    size: "tiny",
+    modelId: "onnx-community/whisper-tiny",
+    approxBytes: 40 * 1024 * 1024,
+    label: "Whisper Tiny",
+  },
+  base: {
+    size: "base",
+    modelId: "onnx-community/whisper-base",
+    approxBytes: 75 * 1024 * 1024,
+    label: "Whisper Base",
+  },
+  small: {
+    size: "small",
+    modelId: "onnx-community/whisper-small",
+    approxBytes: 240 * 1024 * 1024,
+    label: "Whisper Small",
+  },
+};
+
+export function whisperModelConfig(size: WhisperWebModelSize): WhisperModelConfig {
+  return WHISPER_WEB_MODELS[size] || WHISPER_WEB_MODELS.base; // unknown size falls back to base
+}
diff --git a/apps/web/lib/ws/handlers/users.ts b/apps/web/lib/ws/handlers/users.ts
index 1ddb7a71c..0b33698d3 100644
--- a/apps/web/lib/ws/handlers/users.ts
+++ b/apps/web/lib/ws/handlers/users.ts
@@ -1,6 +1,7 @@
 import type { StoreApi } from "zustand";
 import type { AppState } from "@/lib/state/store";
 import type { WsHandlers } from "@/lib/ws/handlers/types";
+import { parseVoiceMode } from "@/lib/ssr/user-settings";
 
 export function registerUsersHandlers(store: StoreApi<AppState>): WsHandlers {
   return {
@@ -31,6 +32,7 @@ export function registerUsersHandlers(store: StoreApi<AppState>): WsHandlers {
               ? "browser_panel"
               : "new_tab",
           changesPanelLayout: message.payload.changes_panel_layout === "tree" ? "tree" : "flat",
+          voiceMode: parseVoiceMode(message.payload.voice_mode),
           loaded: true,
         },
       }));
diff --git a/apps/web/package.json b/apps/web/package.json
index 7de93f075..369517e61 100644
--- a/apps/web/package.json
+++ b/apps/web/package.json
@@ -38,6 +38,7 @@
     "@dnd-kit/core": "^6.3.1",
     "@dnd-kit/sortable": "^10.0.0",
     "@dnd-kit/utilities": "^3.2.2",
+    "@huggingface/transformers": "^4.2.0",
     "@kandev/theme": "workspace:*",
     "@kandev/types": "workspace:*",
     "@kandev/ui": "workspace:*",
diff --git a/apps/web/workers/whisper-web.worker.ts b/apps/web/workers/whisper-web.worker.ts
new file mode 100644
index 000000000..68fa33e4b
--- /dev/null
+++ b/apps/web/workers/whisper-web.worker.ts
@@ -0,0 +1,138 @@
+/// <reference lib="webworker" />
+
+/**
+ * Web Worker that runs OpenAI Whisper entirely in the browser via
+ * @huggingface/transformers (the maintained transformers.js library that
+ * xenova/whisper-web is built on).
+ *
+ * Lives in its own worker because model loading + inference both block the
+ * main thread for several seconds — would freeze the chat input otherwise.
+ *
+ * Wire protocol (postMessage):
+ *   in:  { type: "init",       model: "onnx-community/whisper-base" }
+ *   in:  { type: "transcribe", audio: Float32Array, language?: string }
+ *   in:  { type: "dispose" }
+ *   out: { type: "progress",   stage: string, progress: number }
+ *   out: { type: "ready" }
+ *   out: { type: "result",     text: string }
+ *   out: { type: "error",      message: string }
+ */
+
+import { pipeline, env, type AutomaticSpeechRecognitionPipeline } from "@huggingface/transformers";
+
+// Disable transformers.js's local-models lookup — we only load from the HF
+// CDN so the worker doesn't try to fetch files from our own origin.
+env.allowLocalModels = false;
+env.allowRemoteModels = true;
+
+type InitMessage = { type: "init"; model: string };
+type TranscribeMessage = { type: "transcribe"; audio: Float32Array; language?: string };
+type DisposeMessage = { type: "dispose" };
+type InMessage = InitMessage | TranscribeMessage | DisposeMessage;
+
+type OutMessage =
+  | { type: "progress"; stage: string; progress: number }
+  | { type: "ready" }
+  | { type: "result"; text: string }
+  | { type: "error"; message: string };
+
+const ctx = self as unknown as DedicatedWorkerGlobalScope;
+
+let asrPipeline: AutomaticSpeechRecognitionPipeline | null = null;
+let activeModelId: string | null = null;
+
+function post(message: OutMessage) {
+  ctx.postMessage(message); // typed wrapper: only OutMessage shapes are posted back
+}
+
+type ProgressEvent = {
+  status?: string;
+  file?: string;
+  progress?: number;
+};
+
+async function handleInit(msg: InitMessage) {
+  if (asrPipeline && activeModelId === msg.model) {
+    post({ type: "ready" });
+    return;
+  }
+  try {
+    if (asrPipeline) {
+      // Forget the old pipeline before disposing it: if dispose() rejects we
+      // still answer with an "error" instead of leaving the client waiting.
+      const previous = asrPipeline;
+      asrPipeline = null;
+      activeModelId = null;
+      await previous.dispose();
+    }
+    // dtype choice rationale: the `_quantized` / `q8` and `q4` decoder weights
+    // for whisper-base both contain `MatMulNBits` ops that only execute on
+    // WebGPU. On browsers without WebGPU (most Firefox, older Chrome) onnxruntime
+    // throws `Missing required scale: ... weight_merged_0_scale`. fp16 has no
+    // quantized ops at all so it works on both WASM and WebGPU; it's ~half the
+    // size of fp32 with no perceptible accuracy loss for ASR.
+    const created = await pipeline("automatic-speech-recognition", msg.model, {
+      dtype: {
+        encoder_model: "fp32",
+        decoder_model_merged: "fp16",
+      },
+      progress_callback: (e: ProgressEvent) => {
+        if (typeof e?.progress === "number") {
+          post({ type: "progress", stage: e.status ?? "download", progress: e.progress });
+        }
+      },
+    });
+    asrPipeline = created as AutomaticSpeechRecognitionPipeline;
+    activeModelId = msg.model;
+    post({ type: "ready" });
+  } catch (err) {
+    post({ type: "error", message: errorMessage(err) });
+  }
+}
+
+async function handleTranscribe(msg: TranscribeMessage) {
+  if (!asrPipeline) {
+    post({ type: "error", message: "Whisper worker not initialized" });
+    return;
+  }
+  // A missing language or the literal "auto" is forwarded as `undefined`.
+  const language = msg.language && msg.language !== "auto" ? msg.language : undefined;
+  try {
+    const output = (await asrPipeline(msg.audio, { language, task: "transcribe" })) as
+      | { text?: string }
+      | Array<{ text?: string }>;
+    // The result is one object or an array; array pieces are joined by spaces.
+    const chunks = Array.isArray(output) ? output : [output];
+    post({ type: "result", text: chunks.map((c) => c.text ?? "").join(" ").trim() });
+  } catch (err) {
+    post({ type: "error", message: errorMessage(err) });
+  }
+}
+
+async function handleDispose() {
+  // Nothing loaded means nothing to release.
+  if (!asrPipeline) return;
+  await asrPipeline.dispose();
+  asrPipeline = null;
+  activeModelId = null;
+}
+
+function errorMessage(err: unknown): string {
+  // Error instances carry their own message; anything else is stringified.
+  return err instanceof Error ? err.message : String(err);
+}
+
+ctx.addEventListener("message", ({ data }: MessageEvent<InMessage>) => {
+  // Fire-and-forget dispatch; init/transcribe post their own replies.
+  if (data.type === "init") {
+    void handleInit(data);
+    return;
+  }
+  if (data.type === "transcribe") {
+    void handleTranscribe(data);
+    return;
+  }
+  if (data.type === "dispose") {
+    void handleDispose();
+  }
+});