Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions apps/backend/cmd/kandev/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ import (
userhandlers "github.com/kandev/kandev/internal/user/handlers"
utilitycontroller "github.com/kandev/kandev/internal/utility/controller"
utilityhandlers "github.com/kandev/kandev/internal/utility/handlers"
voicehandlers "github.com/kandev/kandev/internal/voice/handlers"
"github.com/kandev/kandev/internal/voice/transcribe"
workflowcontroller "github.com/kandev/kandev/internal/workflow/controller"
workflowhandlers "github.com/kandev/kandev/internal/workflow/handlers"
"github.com/kandev/kandev/internal/worktree"
Expand Down Expand Up @@ -449,6 +451,7 @@ type routeParams struct {
devMode bool
httpPort int
features config.FeaturesConfig
voice config.VoiceConfig
log *logger.Logger
}

Expand Down Expand Up @@ -698,6 +701,11 @@ func registerSecondaryRoutes(
utilityhandlers.RegisterRoutes(p.router, p.utilityCtrl, p.lifecycleMgr, p.hostUtilityMgr, p.services.User, p.log)
p.log.Debug("Registered Utility Agents handlers (HTTP)")

// Voice transcription fallback. The route always mounts, but returns 503
// when no API key is configured so the frontend can hide the path.
voicehandlers.RegisterRoutes(p.router, transcribe.New(p.voice.OpenAIAPIKey), p.log)
p.log.Debug("Registered Voice handlers (HTTP)")

agentcapabilities.RegisterRoutes(p.router, p.hostUtilityMgr, p.log)
p.log.Debug("Registered Agent Capabilities handlers (HTTP)")

Expand Down
1 change: 1 addition & 0 deletions apps/backend/cmd/kandev/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1513,6 +1513,7 @@ func buildHTTPServer(
devMode: cfg.Debug.DevMode || cfg.Debug.PprofEnabled,
httpPort: port,
features: cfg.Features,
voice: cfg.Voice,
log: log,
})

Expand Down
19 changes: 19 additions & 0 deletions apps/backend/internal/common/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ type Config struct {
RepoClone RepoCloneConfig `mapstructure:"repoClone"`
Debug DebugConfig `mapstructure:"debug"`
Office OfficeConfig `mapstructure:"office"`
Voice VoiceConfig `mapstructure:"voice"`
Features FeaturesConfig `mapstructure:"features"`
}

Expand Down Expand Up @@ -147,6 +148,20 @@ type OfficeConfig struct {
JWTSigningKey string `mapstructure:"jwtSigningKey"`
}

// VoiceConfig holds configuration for the chat voice-input transcription
// fallback. The primary voice-input engine runs entirely in the browser
// (Web Speech API); this server-side fallback is only used when the browser
// has no SpeechRecognition support (e.g. Firefox).
//
// When OpenAIAPIKey is empty the /api/v1/transcribe endpoint returns 503
// and the frontend hides the fallback path, so the feature is safe to
// ship un-configured.
type VoiceConfig struct {
	// OpenAIAPIKey is the API key used to call OpenAI's Whisper transcription
	// endpoint. It is read from the `voice.openAIApiKey` config key, which
	// defaults to the empty string and is bound to the
	// KANDEV_VOICE_OPENAI_API_KEY environment variable.
	//
	// NOTE(review): this is a credential — presumably it should never be
	// logged or echoed back in API responses; confirm against callers.
	OpenAIAPIKey string `mapstructure:"openAIApiKey"`
}

// FeaturesConfig is the central registry of runtime feature flags. Every flag
// defaults to false so production binaries ship with new work hidden until a
// deployment explicitly opts in (env var, e.g. KANDEV_FEATURES_OFFICE=true).
Expand Down Expand Up @@ -312,6 +327,9 @@ func setDefaults(v *viper.Viper) {
// Office defaults
v.SetDefault("office.jwtSigningKey", "")

// Voice defaults
v.SetDefault("voice.openAIApiKey", "")

// Feature-flag defaults live in ./features.yaml (symlinked to
// apps/backend/internal/features/features.yaml). LoadWithPath applies
// them via features.ApplyDefaults after this function returns so the
Expand Down Expand Up @@ -428,6 +446,7 @@ func LoadWithPath(configPath string) (*Config, error) {
_ = v.BindEnv("events.namespace", "KANDEV_EVENTS_NAMESPACE")
_ = v.BindEnv("debug.devMode", "KANDEV_DEBUG_DEV_MODE")
_ = v.BindEnv("debug.pprofEnabled", "KANDEV_DEBUG_PPROF_ENABLED")
_ = v.BindEnv("voice.openAIApiKey", "KANDEV_VOICE_OPENAI_API_KEY")

// Configure config file
v.SetConfigName("config")
Expand Down
1 change: 1 addition & 0 deletions apps/backend/internal/user/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func (c *Controller) UpdateUserSettings(ctx context.Context, req dto.UpdateUserS
TerminalFontFamily: req.TerminalFontFamily,
TerminalFontSize: req.TerminalFontSize,
ChangesPanelLayout: req.ChangesPanelLayout,
VoiceMode: req.VoiceMode,
})
if err != nil {
return dto.UserSettingsResponse{}, err
Expand Down
3 changes: 3 additions & 0 deletions apps/backend/internal/user/dto/dto.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type UserSettingsDTO struct {
TerminalFontFamily string `json:"terminal_font_family"`
TerminalFontSize int `json:"terminal_font_size"`
ChangesPanelLayout string `json:"changes_panel_layout"`
VoiceMode models.VoiceModeSettings `json:"voice_mode"`
UpdatedAt string `json:"updated_at"`
}

Expand Down Expand Up @@ -82,6 +83,7 @@ type UpdateUserSettingsRequest struct {
TerminalFontFamily *string `json:"terminal_font_family,omitempty"`
TerminalFontSize *int `json:"terminal_font_size,omitempty"`
ChangesPanelLayout *string `json:"changes_panel_layout,omitempty"`
VoiceMode *models.VoiceModeSettings `json:"voice_mode,omitempty"`
}

func FromUser(user *models.User) UserDTO {
Expand Down Expand Up @@ -120,6 +122,7 @@ func FromUserSettings(settings *models.UserSettings) UserSettingsDTO {
TerminalFontFamily: settings.TerminalFontFamily,
TerminalFontSize: settings.TerminalFontSize,
ChangesPanelLayout: settings.ChangesPanelLayout,
VoiceMode: settings.VoiceMode,
UpdatedAt: settings.UpdatedAt.Format(time.RFC3339),
}
}
26 changes: 26 additions & 0 deletions apps/backend/internal/user/models/models.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,36 @@ type UserSettings struct {
TerminalFontFamily string `json:"terminal_font_family"`
TerminalFontSize int `json:"terminal_font_size"`
ChangesPanelLayout string `json:"changes_panel_layout"` // "flat" | "tree"
VoiceMode VoiceModeSettings `json:"voice_mode"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}

// VoiceModeSettings is the per-user configuration surface for the chat
// voice-input feature. Stored as a nested JSON object inside the `users.settings`
// blob — adding fields here does not require a schema migration.
//
// The struct contains only bools and strings, so values are comparable with
// == and copied by plain assignment.
type VoiceModeSettings struct {
	// Enabled gates the whole feature. When false, the mic button is hidden
	// entirely and no voice-related hooks run on the chat input. Defaults to
	// true for new users; pre-existing user rows that have no `enabled` field
	// in their stored JSON are also treated as enabled (see store layer).
	Enabled bool `json:"enabled"`
	// Engine is the user's preferred transcription engine.
	// "auto" | "webSpeech" | "whisperWeb" | "whisperServer". Default "auto".
	Engine string `json:"engine"`
	// Language is the BCP-47 tag or "auto" to use the browser's language.
	// Examples: "en-US", "pt-PT", "ja-JP". Default "auto".
	Language string `json:"language"`
	// Mode controls how the mic button is activated: "toggle" (click to
	// start/stop) or "hold" (push-to-talk). Default "toggle".
	Mode string `json:"mode"`
	// AutoSend submits the chat message immediately after the transcript is
	// inserted.
	AutoSend bool `json:"auto_send"`
	// WhisperWebModel selects the in-browser Whisper model when
	// engine = whisperWeb. "tiny" | "base" | "small". Default "base".
	WhisperWebModel string `json:"whisper_web_model"`
}

// SavedLayout represents a user-saved dockview layout configuration.
type SavedLayout struct {
ID string `json:"id"`
Expand Down
65 changes: 65 additions & 0 deletions apps/backend/internal/user/service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ type UpdateUserSettingsRequest struct {
TerminalFontFamily *string
TerminalFontSize *int
ChangesPanelLayout *string
VoiceMode *models.VoiceModeSettings
}

func NewService(repo store.Repository, eventBus bus.EventBus, log *logger.Logger) *Service {
Expand Down Expand Up @@ -122,6 +123,9 @@ func (s *Service) UpdateUserSettings(ctx context.Context, req *UpdateUserSetting
if err := applySidebarViews(settings, req); err != nil {
return nil, fmt.Errorf("%w: %s", ErrValidation, err.Error())
}
if err := applyVoiceMode(settings, req.VoiceMode); err != nil {
return nil, fmt.Errorf("%w: %s", ErrValidation, err.Error())
}
settings.UpdatedAt = time.Now().UTC()
if err := s.repo.UpsertUserSettings(ctx, settings); err != nil {
return nil, err
Expand Down Expand Up @@ -220,6 +224,66 @@ func applyChangesPanelLayout(settings *models.UserSettings, value *string) error
return nil
}

// validVoiceEngines is the allow-list for voice_mode.engine.
var validVoiceEngines = map[string]struct{}{
	"auto":          {},
	"webSpeech":     {},
	"whisperWeb":    {},
	"whisperServer": {},
}

// validVoiceModes is the allow-list for voice_mode.mode.
var validVoiceModes = map[string]struct{}{
	"toggle": {},
	"hold":   {},
}

// validWhisperWebModels is the allow-list for voice_mode.whisper_web_model.
var validWhisperWebModels = map[string]struct{}{
	"tiny":  {},
	"base":  {},
	"small": {},
}

// applyVoiceMode validates the inbound voice-mode settings and merges them
// onto settings.VoiceMode. A nil value is a no-op.
//
// The string fields (engine, language, mode, whisper_web_model) are merged
// independently: an empty string means "no change", so a partial update
// (e.g. just `engine`) leaves the other strings untouched. Engine, mode and
// whisper_web_model are checked against their allow-lists. Language is
// free-form and only has surrounding whitespace trimmed; a value that is
// empty after trimming is also treated as "no change", so a whitespace-only
// language cannot blank out the stored one.
//
// `enabled` and `auto_send` are plain bools with no "unset" state, so they are
// always copied from value — a payload that omits them resets both to false.
// The settings UI always sends the full VoiceMode object, so partial updates
// that would otherwise zero these are not a real concern.
//
// It returns an error naming the offending field when validation fails; in
// that case settings is left unmodified.
func applyVoiceMode(settings *models.UserSettings, value *models.VoiceModeSettings) error {
	if value == nil {
		return nil
	}
	// Work on a copy so a validation failure part-way through never leaves
	// settings half-updated.
	current := settings.VoiceMode
	if current.Engine == "" {
		current.Engine = "auto"
	}
	if value.Engine != "" {
		if _, ok := validVoiceEngines[value.Engine]; !ok {
			return errors.New("voice_mode.engine must be 'auto', 'webSpeech', 'whisperWeb', or 'whisperServer'")
		}
		current.Engine = value.Engine
	}
	// Trim before the emptiness check: checking the raw value first would let
	// "   " through and overwrite the stored language with "".
	if language := strings.TrimSpace(value.Language); language != "" {
		current.Language = language
	}
	if value.Mode != "" {
		if _, ok := validVoiceModes[value.Mode]; !ok {
			return errors.New("voice_mode.mode must be 'toggle' or 'hold'")
		}
		current.Mode = value.Mode
	}
	if value.WhisperWebModel != "" {
		if _, ok := validWhisperWebModels[value.WhisperWebModel]; !ok {
			return errors.New("voice_mode.whisper_web_model must be 'tiny', 'base', or 'small'")
		}
		current.WhisperWebModel = value.WhisperWebModel
	}
	// Bools carry no "unset" sentinel; they are taken from the payload as-is.
	current.AutoSend = value.AutoSend
	current.Enabled = value.Enabled
	settings.VoiceMode = current
	return nil
}

// applyChatSubmitKey validates and applies the chat_submit_key setting.
func (s *Service) applyChatSubmitKey(settings *models.UserSettings, req *UpdateUserSettingsRequest) error {
if req.ChatSubmitKey == nil {
Expand Down Expand Up @@ -332,6 +396,7 @@ func (s *Service) publishUserSettingsEvent(ctx context.Context, settings *models
"terminal_font_family": settings.TerminalFontFamily,
"terminal_font_size": settings.TerminalFontSize,
"changes_panel_layout": settings.ChangesPanelLayout,
"voice_mode": settings.VoiceMode,
"updated_at": settings.UpdatedAt.Format(time.RFC3339),
}
if err := s.eventBus.Publish(ctx, events.UserSettingsUpdated, bus.NewEvent(events.UserSettingsUpdated, "user-service", data)); err != nil {
Expand Down
111 changes: 111 additions & 0 deletions apps/backend/internal/user/service/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,3 +441,114 @@ func TestApplySidebarViews(t *testing.T) {
})
}
}

// TestApplyVoiceMode covers applyVoiceMode: the nil no-op, a full update, an
// explicit disable, rejection of each enumerated field's invalid values, and
// the documented partial-update behavior (strings preserved, bools zeroed).
func TestApplyVoiceMode(t *testing.T) {
	t.Run("nil value leaves settings unchanged", func(t *testing.T) {
		stored := &models.UserSettings{
			VoiceMode: models.VoiceModeSettings{Engine: "webSpeech", Language: "en-US"},
		}
		err := applyVoiceMode(stored, nil)
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		got := stored.VoiceMode
		if !(got.Engine == "webSpeech" && got.Language == "en-US") {
			t.Fatalf("expected unchanged, got %+v", stored.VoiceMode)
		}
	})

	t.Run("happy path: applies a full update", func(t *testing.T) {
		want := models.VoiceModeSettings{
			Enabled:         true,
			Engine:          "whisperWeb",
			Language:        "pt-PT",
			Mode:            "hold",
			AutoSend:        true,
			WhisperWebModel: "small",
		}
		// Pass a separate copy so the expectation is independent of the
		// payload handed to the function under test.
		payload := want
		stored := &models.UserSettings{}
		if err := applyVoiceMode(stored, &payload); err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if stored.VoiceMode != want {
			t.Fatalf("expected %+v, got %+v", want, stored.VoiceMode)
		}
	})

	t.Run("enabled=false is honored (user disabled the feature)", func(t *testing.T) {
		stored := &models.UserSettings{VoiceMode: models.VoiceModeSettings{Enabled: true}}
		err := applyVoiceMode(stored, &models.VoiceModeSettings{Enabled: false})
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if stored.VoiceMode.Enabled {
			t.Fatalf("expected Enabled=false after disable, got true")
		}
	})

	// Each enumerated field must reject values outside its allow-list with an
	// error that names the field.
	rejections := []struct {
		name    string
		payload models.VoiceModeSettings
		field   string // substring the error must contain
		label   string // word used in the failure message
	}{
		{
			name:    "invalid engine is rejected",
			payload: models.VoiceModeSettings{Engine: "bogus"},
			field:   "voice_mode.engine",
			label:   "engine",
		},
		{
			name:    "invalid mode is rejected",
			payload: models.VoiceModeSettings{Mode: "tap"},
			field:   "voice_mode.mode",
			label:   "mode",
		},
		{
			name:    "invalid whisper_web_model is rejected",
			payload: models.VoiceModeSettings{WhisperWebModel: "huge"},
			field:   "voice_mode.whisper_web_model",
			label:   "model",
		},
	}
	for i := range rejections {
		tc := rejections[i]
		t.Run(tc.name, func(t *testing.T) {
			err := applyVoiceMode(&models.UserSettings{}, &tc.payload)
			if err == nil || !strings.Contains(err.Error(), tc.field) {
				t.Fatalf("expected %s validation error, got %v", tc.label, err)
			}
		})
	}

	t.Run("partial update preserves string fields but zeroes booleans", func(t *testing.T) {
		stored := &models.UserSettings{
			VoiceMode: models.VoiceModeSettings{
				Enabled:         true,
				Engine:          "whisperServer",
				Language:        "en-GB",
				Mode:            "toggle",
				AutoSend:        true,
				WhisperWebModel: "tiny",
			},
		}
		// Empty strings in the payload mean "no change" for string fields, but
		// bools have no "unset" sentinel — every PATCH carries them. The
		// settings UI always sends the full VoiceMode object, so a partial
		// payload only shows up in tests or hand-crafted requests; the checks
		// below pin that explicit behavior so it doesn't drift silently.
		if err := applyVoiceMode(stored, &models.VoiceModeSettings{Engine: "webSpeech"}); err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		// First failing check wins, mirroring a sequence of fatal assertions.
		switch got := stored.VoiceMode; {
		case got.Engine != "webSpeech":
			t.Fatalf("expected engine=webSpeech, got %q", got.Engine)
		case got.Language != "en-GB":
			t.Fatalf("expected language preserved, got %q", got.Language)
		case got.Mode != "toggle":
			t.Fatalf("expected mode preserved, got %q", got.Mode)
		case got.WhisperWebModel != "tiny":
			t.Fatalf("expected whisper model preserved, got %q", got.WhisperWebModel)
		case got.Enabled:
			t.Fatalf("expected Enabled zeroed on partial update, got true")
		case got.AutoSend:
			t.Fatalf("expected AutoSend zeroed on partial update, got true")
		}
	})
}
Loading
Loading