Krinxy · Krinxy · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -12,6 +12,7 @@ dist/
 build/
 coverage/*
 !coverage/coverage_gate.py
+frontend/coverage/
 .coverage
 .coverage.*
 

diff --git a/README.md b/README.md
@@ -41,9 +41,18 @@ Current repository quality pattern:
 Frontend steering highlights:
 
 - New chat creation is handled from the sidebar Current Session action.
-- Voice input uses browser microphone permission and speech recognition.
+- Voice input triggers a dedicated speech service from the microphone button and streams text back live.
 - Profile includes a help/forum section for feature lookup.
 
+Local voice stack launch:
+
+1. `npm run dev`
+
+Manual fallback (two terminals):
+
+1. `npm --prefix backend run dev`
+2. `npm --prefix frontend run dev`
+
 ## Quality Gates For Main
 
 The main branch is protected by required CI checks:

diff --git a/backend/__init__.py b/backend/__init__.py
@@ -0,0 +1 @@
+"""Backend runtime package for Chat Assistant AI."""
diff --git a/backend/app/__init__.py b/backend/app/__init__.py
@@ -0,0 +1 @@
+"""FastAPI application package."""
diff --git a/backend/app/cache/__init__.py b/backend/app/cache/__init__.py
@@ -0,0 +1 @@
+"""Cache abstractions for speech transcription sessions."""
diff --git a/backend/app/handlers/__init__.py b/backend/app/handlers/__init__.py
@@ -0,0 +1 @@
+"""Request/connection handlers for backend entrypoints."""
diff --git a/backend/app/main.py b/backend/app/main.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from contextlib import asynccontextmanager
+import os
+from typing import AsyncIterator
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from starlette.concurrency import run_in_threadpool
+
+from .services.core.chat.transcription.websocket import router as transcription_router
+from .services.utils.transcription.preflight import (
+    print_preflight_report,
+    run_transcription_preflight,
+)
+
+
+def _parse_bool_env(name: str, default_value: bool) -> bool:
+    """Read env var *name* as a flag; an unset or blank value yields *default_value*."""
+    flag_text = os.getenv(name, "").strip().lower()
+    if not flag_text:
+        return default_value
+    return flag_text in ("1", "true", "yes", "on")
+
+
+def create_app() -> FastAPI:
+    """Build the FastAPI app: startup preflight, CORS, ``/health`` and the transcription router."""
+    @asynccontextmanager
+    async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
+        # The preflight is a synchronous call, so it runs in a worker thread, not on the event loop.
+        preload_on_startup = _parse_bool_env("TRANSCRIPTION_PRELOAD_ON_STARTUP", True)
+        report = await run_in_threadpool(run_transcription_preflight, preload_runtime=preload_on_startup)
+        print_preflight_report(report)
+        yield
+
+    app = FastAPI(title="Chat Assistant AI Speech Backend", version="0.1.0", lifespan=lifespan)
+
+    # Wildcard origins must not be combined with credentials: any site could send credentialed requests.
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=False,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
+    # Liveness endpoint: always answers with the same static payload.
+    @app.get("/health")
+    async def health() -> dict[str, str]:
+        return {"status": "ok"}
+
+    app.include_router(transcription_router)
+    return app
+
+
+app = create_app()  # Module-level application instance, built once when this module is imported.
diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py
@@ -0,0 +1 @@
+"""Service layer for backend runtime orchestration."""
diff --git a/backend/app/services/core/__init__.py b/backend/app/services/core/__init__.py
@@ -0,0 +1 @@
+"""Core backend services package."""
diff --git a/backend/app/services/core/chat/__init__.py b/backend/app/services/core/chat/__init__.py
@@ -0,0 +1 @@
+"""Core chat service modules."""
diff --git a/backend/app/services/core/chat/transcription/__init__.py b/backend/app/services/core/chat/transcription/__init__.py
@@ -0,0 +1 @@
+"""Core speech transcription runtime components."""
diff --git a/backend/app/services/core/chat/transcription/handler.py b/backend/app/services/core/chat/transcription/handler.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+import time
+from typing import Any, Callable
+
+from starlette.concurrency import run_in_threadpool
+
+from ....dependency.transcription.speech_cache import SpeechTranscriptionCache, speech_cache
+from .runtime import invoke_local_transcription_lambda, runtime_service
+from .transcriber import WhisperDependenciesMissingError, WhisperInferenceError
+
+RuntimeInvoker = Callable[[bytes, str | None, str | None], str]  # (audio_chunk, language, mime_type) -> text
+
+
+class LiveTranscriptionHandler:
+    """Connection-level orchestration for websocket speech transcription.
+
+    Reports each audio chunk's outcome as a ``transcript``, ``empty`` or ``error`` payload.
+    """
+
+    def __init__(
+        self,
+        runtime_invoker: RuntimeInvoker | None = None,
+        cache_backend: SpeechTranscriptionCache | None = None,
+    ) -> None:
+        """Wire the handler; omitted arguments fall back to the module-level defaults."""
+        # ``is None`` rather than ``or``: an injected object that happens to be falsy is still used.
+        self._runtime_invoker = invoke_local_transcription_lambda if runtime_invoker is None else runtime_invoker
+        self._runtime_service = runtime_service if runtime_invoker is None else None
+        self._cache_backend = speech_cache if cache_backend is None else cache_backend
+
+    def open_session(self) -> str:
+        """Return a session id obtained from the cache backend's ``create_session_id``."""
+        return self._cache_backend.create_session_id()
+
+    def close_session(self, session_id: str) -> None:
+        """Ask the cache backend to clear ``session_id``."""
+        self._cache_backend.clear_session(session_id)
+
+    def recommended_max_inflight_chunks(self) -> int:
+        """Return the runtime service's suggestion, or 1 when a custom invoker was injected."""
+        if self._runtime_service is None:
+            return 1
+        return self._runtime_service.recommended_max_inflight_chunks()
+
+    @staticmethod
+    def normalize_language(language: str | None) -> str | None:
+        """Reduce a locale tag such as ``"en-US"`` to its lower-cased primary subtag (``"en"``).
+
+        Yields ``None`` for ``None``, blank input, or an empty primary subtag (e.g. ``"-US"``).
+        """
+        if language is None:
+            return None
+        primary = language.strip().lower().partition("-")[0].strip()
+        return primary or None
+
+    async def transcribe_chunk(
+        self,
+        *,
+        session_id: str,
+        audio_chunk: bytes,
+        chunk_index: int,
+        language: str | None,
+        mime_type: str | None = None,
+    ) -> dict[str, Any]:
+        """Transcribe one audio chunk and describe the outcome as a payload dict.
+
+        The received chunk is recorded in the cache first; the invoker then runs in a
+        worker thread. A non-blank result is stored and returned as ``transcript``;
+        a blank result or ``WhisperInferenceError`` yields ``empty``; missing Whisper
+        dependencies yield ``error``.
+        """
+        self._cache_backend.store_received_chunk(
+            session_id=session_id,
+            chunk_index=chunk_index,
+            audio_chunk=audio_chunk,
+            language=language,
+        )
+
+        started_at = time.perf_counter()
+
+        def elapsed_ms() -> int:
+            return int((time.perf_counter() - started_at) * 1000)
+
+        try:
+            text = await run_in_threadpool(self._runtime_invoker, audio_chunk, language, mime_type)
+        except WhisperDependenciesMissingError:
+            return {
+                "type": "error",
+                "message": 'Whisper dependencies missing. Install with: pip install -e ".[backend]"',
+            }
+        except WhisperInferenceError:
+            # An inference failure is reported to the caller as an empty chunk, not as an error.
+            return {"type": "empty", "chunk_index": chunk_index, "latency_ms": elapsed_ms()}
+
+        latency_ms = elapsed_ms()
+        normalized_text = text.strip()
+        if not normalized_text:
+            return {"type": "empty", "chunk_index": chunk_index, "latency_ms": latency_ms}
+
+        self._cache_backend.store_transcript(
+            session_id=session_id,
+            chunk_index=chunk_index,
+            transcript=normalized_text,
+            latency_ms=latency_ms,
+        )
+        return {
+            "type": "transcript",
+            "text": normalized_text,
+            "chunk_index": chunk_index,
+            "is_final": True,
+            "latency_ms": latency_ms,
+        }
diff --git a/backend/app/services/core/chat/transcription/runtime.py b/backend/app/services/core/chat/transcription/runtime.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from threading import Lock
+
+from .transcriber import WhisperChunkTranscriber
+
+
+@dataclass(frozen=True)
+class TranscriptionInvokeEvent:  # Immutable argument bundle accepted by TranscriptionRuntimeService.invoke.
+    audio_chunk: bytes  # audio bytes for one chunk
+    language: str | None = None  # optional language value, passed to the transcriber unchanged
+    mime_type: str | None = None  # optional MIME type value, passed to the transcriber unchanged
+
+
+class TranscriptionRuntimeService:
+    """Runtime service that behaves like a local lambda invocation target."""
+
+    def __init__(self, transcriber: WhisperChunkTranscriber | None = None) -> None:
+        """Wrap ``transcriber``; a default ``WhisperChunkTranscriber`` is created when none is given."""
+        self._transcriber = transcriber or WhisperChunkTranscriber()
+        self._accelerated_invoke_lock = Lock()
+
+    def preload(self) -> None:
+        """Delegate to the transcriber's ``ensure_loaded``."""
+        self._transcriber.ensure_loaded()
+
+    @property
+    def runtime_device(self) -> str | None:
+        """Device string exposed by the transcriber, or ``None`` when absent or not a ``str``."""
+        reported = getattr(self._transcriber, "runtime_device", None)
+        if not isinstance(reported, str):
+            return None
+        return reported
+
+    @staticmethod
+    def _is_accelerated(device: str) -> bool:
+        """Tell whether ``device`` (case-insensitive) starts with ``cuda``, ``mps`` or ``xpu``."""
+        return device.lower().startswith(("cuda", "mps", "xpu"))
+
+    def recommended_max_inflight_chunks(self) -> int:
+        """Return 1 for an unknown or accelerated device, otherwise 2."""
+        device = self.runtime_device
+        if device is None or self._is_accelerated(device):
+            return 1
+        return 2
+
+    def invoke(self, event: TranscriptionInvokeEvent) -> str:
+        """Transcribe ``event`` and return the text; calls on accelerated devices hold a lock."""
+        device = self.runtime_device
+        call_kwargs = dict(audio_chunk=event.audio_chunk, language=event.language, mime_type=event.mime_type)
+
+        # NOTE(review): an unknown device (None) runs unlocked although the in-flight hint treats it as 1.
+        if device is None or not self._is_accelerated(device):
+            return self._transcriber.transcribe_chunk(**call_kwargs)
+
+        # Accelerated backends can be unstable when sharing one pipeline instance across threads.
+        with self._accelerated_invoke_lock:
+            return self._transcriber.transcribe_chunk(**call_kwargs)
+
+
+runtime_service = TranscriptionRuntimeService()  # Shared module-level instance used by invoke_local_transcription_lambda.
+
+
+def invoke_local_transcription_lambda(
+    audio_chunk: bytes,
+    language: str | None = None,
+    mime_type: str | None = None,
+) -> str:
+    """Transcribe one chunk through the shared ``runtime_service`` and return its text.
+
+    The arguments are wrapped in a ``TranscriptionInvokeEvent`` before the call.
+    """
+    payload = TranscriptionInvokeEvent(audio_chunk=audio_chunk, language=language, mime_type=mime_type)
+    return runtime_service.invoke(payload)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Backend runtime package for Chat Assistant AI."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Cache abstractions for speech transcription sessions."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Request/connection handlers for backend entrypoints."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Service layer for backend runtime orchestration."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Core speech transcription runtime components."""