thewafflehaus · TheTom · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/Sources/FFAI/KVCache/AURACodebook.swift b/Sources/FFAI/KVCache/AURACodebook.swift
@@ -20,7 +20,7 @@
 // the coordinate distribution of unit-sphere vectors converges to a
 // near-Gaussian, so a fixed Lloyd-Max table is near-optimal.
 //
-// The reference values here are mined from llama.cpp's `k_quants`
+// The reference values here are mined from the reference C++ `k_quants`
 // tables (empirically optimal for unit-norm Gaussian data at d=128)
 // and scaled to other head dims by √(128 / dim) — a heuristic that
 // approximates the analytic 1/√d Beta-variance scaling from the
@@ -246,6 +246,56 @@ public enum AURACodebook {
         return base.map { $0 * scale }
     }
 
+    /// Builds the Lloyd-Max centroid table for (`dim`, `bits`) as a 1-D
+    /// tensor stored in `dtype`. The AURA cache keeps its codebook in the
+    /// activation dtype so the encode and decode kernels (which take
+    /// `Tensor<T>` for the codebook) read it without a per-call cast.
+    /// Values are computed in Float and narrowed on the host.
+    public static func centroidsTensor(
+        dim: Int, bits: Int, dtype: DType, device: Device = .shared
+    ) -> Tensor {
+        let table = centroids(dim: dim, bits: bits)
+        return writeFloatsToTensor(
+            table, shape: [table.count], dtype: dtype, device: device)
+    }
+
+    /// Lloyd-Max boundary (threshold) table for (`dim`, `bits`) as a 1-D
+    /// tensor stored in `dtype`. Post-metaltile #226, `aura_encode` takes
+    /// `boundaries: Tensor<T>` — a kernel-side bandwidth win, since Π +
+    /// boundaries dominate the encode kernel's memory traffic. Values are
+    /// computed in Float; the host-side bf16/f16 rounding (~1e-3) sits well
+    /// below the 2-4-bit quant bin, so the matched-norm correction is stable.
+    public static func boundariesTensor(
+        dim: Int, bits: Int, dtype: DType, device: Device = .shared
+    ) -> Tensor {
+        let thresholds = boundaries(dim: dim, bits: bits)
+        return writeFloatsToTensor(
+            thresholds, shape: [thresholds.count], dtype: dtype, device: device)
+    }
+
+    /// Host-side conversion of `[Float]` into a tensor of `dtype` (f32 / f16 /
+    /// bf16; anything else traps). Narrow dtypes round to nearest, NaN stays
+    /// NaN. Shared by `centroidsTensor` and `boundariesTensor`.
+    private static func writeFloatsToTensor(
+        _ values: [Float], shape: [Int], dtype: DType, device: Device
+    ) -> Tensor {
+        let t = Tensor.empty(shape: shape, dtype: dtype, device: device)
+        switch dtype {
+        case .f32: t.copyIn(from: values)
+        case .f16: t.copyIn(from: values.map { Float16($0) })
+        case .bf16:  // add 0x7FFF + kept LSB, then drop 16 bits = round-half-to-even
+            t.copyIn(from: values.map { (v: Float) -> UInt16 in
+                let bits = v.bitPattern
+                // NaN is truncated with a mantissa bit forced so the carry can't make it ±inf.
+                if v.isNaN { return UInt16(truncatingIfNeeded: bits >> 16) | 0x0040 }
+                return UInt16(truncatingIfNeeded: (bits &+ 0x7FFF &+ ((bits >> 16) & 1)) >> 16)
+            })
+        default:
+            fatalError("AURACodebook.writeFloatsToTensor: unsupported dtype \(dtype); "
+                + "AURA cache supports f32 / f16 / bf16")
+        }
+        return t
+    }
+
     /// Bytes-per-token after AURA packing at this bit width and dim.
     /// `ceil(dim * bits / 32) * 4` for the packed u32 array, plus 4
     /// bytes for the f32 per-token norm. Excludes any per-vector DC

diff --git a/Sources/FFAI/KVCache/AURAQuantizedKVCache.swift b/Sources/FFAI/KVCache/AURAQuantizedKVCache.swift
@@ -107,16 +107,21 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
     /// Π^T in the activation dtype, used to un-rotate the SDPA output
     /// before `oProj`. Aliases `rotationT` when `dtype == .f32`.
     public let rotationDtypeT: Tensor
-    public let kCodebook: Tensor  // [2^keyBits]   f32
-    public let kBoundaries: Tensor  // [2^keyBits-1] f32
-    public let vCodebook: Tensor  // [2^valueBits] f32
-    public let vBoundaries: Tensor  // [2^valueBits-1] f32
+    /// Codebook in the cache dtype. Encode + decode kernels read
+    /// directly with no per-call cast — the dtype unification landed
+    /// when the single-pass `aura_flash_sdpa` kernel was migrated to
+    /// `Tensor<T>` (matches the production C++ TQ+ fork pattern: fp16-
+    /// stored norms / codebook, f32-at-use via cast-at-load).
+    public let kCodebook: Tensor  // [2^keyBits]   dtype
+    public let kBoundaries: Tensor  // [2^keyBits-1] dtype — Lloyd-Max thresholds (Tensor<T> per metaltile #226)
+    public let vCodebook: Tensor  // [2^valueBits] dtype
+    public let vBoundaries: Tensor  // [2^valueBits-1] dtype
 
     // Per-cache compressed storage.
     public let kPacked: Tensor  // [nKVHeads, maxSeq, kPackedWidth] u32
     public let vPacked: Tensor  // [nKVHeads, maxSeq, vPackedWidth] u32
-    public let kNorms: Tensor  // [nKVHeads, maxSeq] f32
-    public let vNorms: Tensor  // [nKVHeads, maxSeq] f32
+    public let kNorms: Tensor  // [nKVHeads, maxSeq] dtype — encode writes T, decode reads T
+    public let vNorms: Tensor  // [nKVHeads, maxSeq] dtype
 
     // Shared working buffers — bulk-dequant target; reused across layers.
     public let sharedWorkingK: Tensor  // [nKVHeads, maxSeq, headDim] dtype
@@ -192,11 +197,18 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
             "AURAQuantizedKVCache: rotationDtype/rotationDtypeT dtype must match cache dtype \(dtype)"
         )
         precondition(
-            kCodebook.dtype == .f32 && kBoundaries.dtype == .f32,
-            "AURAQuantizedKVCache: K codebook/boundaries must be f32")
+            kCodebook.dtype == dtype,
+            "AURAQuantizedKVCache: K codebook dtype must match cache dtype \(dtype)")
         precondition(
-            vCodebook.dtype == .f32 && vBoundaries.dtype == .f32,
-            "AURAQuantizedKVCache: V codebook/boundaries must be f32")
+            kBoundaries.dtype == dtype,
+            "AURAQuantizedKVCache: K boundaries dtype must match cache dtype \(dtype) "
+                + "— metaltile #226 unified rotation/boundaries to Tensor<T>")
+        precondition(
+            vCodebook.dtype == dtype,
+            "AURAQuantizedKVCache: V codebook dtype must match cache dtype \(dtype)")
+        precondition(
+            vBoundaries.dtype == dtype,
+            "AURAQuantizedKVCache: V boundaries dtype must match cache dtype \(dtype)")
         precondition(
             sharedWorkingK.shape == [nKVHeads, maxSeq, headDim],
             "AURAQuantizedKVCache: sharedWorkingK shape mismatch")
@@ -232,9 +244,9 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
         self.vPacked = Tensor.empty(
             shape: [nKVHeads, maxSeq, vPackedWidth], dtype: .u32, device: device)
         self.kNorms = Tensor.empty(
-            shape: [nKVHeads, maxSeq], dtype: .f32, device: device)
+            shape: [nKVHeads, maxSeq], dtype: dtype, device: device)
         self.vNorms = Tensor.empty(
-            shape: [nKVHeads, maxSeq], dtype: .f32, device: device)
+            shape: [nKVHeads, maxSeq], dtype: dtype, device: device)
 
         // Codec is purely additive in atomic_or terms, so packed slots
         // MUST start zeroed. Norms slots get overwritten per encode but
@@ -377,7 +389,10 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
         let inputBytesPerHead = headDim * dtype.byteSize
         let packedBytesPerSlot = packedWidth * 4  // u32
         let packedBytesPerHead = maxSeq * packedBytesPerSlot
-        let normBytesPerHead = maxSeq * 4  // f32
+        // Norms are stored in the cache dtype post-unification — stride
+        // tracks the activation dtype's byte size, not the legacy 4 (f32).
+        let normByteSize = dtype.byteSize
+        let normBytesPerHead = maxSeq * normByteSize
 
         for h in 0 ..< nKVHeads {
             let inputView = Tensor(
@@ -390,10 +405,16 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
                 shape: [1, packedWidth], dtype: .u32)
             let normsView = Tensor(
                 buffer: norms.buffer,
-                offset: norms.offset + h * normBytesPerHead + pos * 4,
-                shape: [1], dtype: .f32)
+                offset: norms.offset + h * normBytesPerHead + pos * normByteSize,
+                shape: [1], dtype: dtype)
+            // metaltile #226: aura_encode now takes rotation+boundaries
+            // as Tensor<T>. Use the activation-dtype copy of Π
+            // (`rotationDtype`) instead of the legacy f32 `rotation`
+            // field — the f32 field is kept around for any future
+            // kernel that wants f32 rotations, but the encoder no
+            // longer does.
             Ops.auraEncode(
-                input: inputView, rotation: rotation,
+                input: inputView, rotation: rotationDtype,
                 boundaries: boundaries, codebook: codebook,
                 packedOut: packedView, normsOut: normsView,
                 rows: 1, dim: headDim, packedWidth: packedWidth, bits: bits,

diff --git a/Sources/FFAI/KVCache/AURAScheme.swift b/Sources/FFAI/KVCache/AURAScheme.swift
@@ -63,6 +63,61 @@ public struct AURAScheme: Sendable, Equatable, Hashable {
     /// near-baseline quality on tested attention-only models.
     public static let aura4v2 = AURAScheme(keyBits: 4, valueBits: 2)
 
+    /// Production K-protected recipe — 8-bit K + 4-bit V. Matches
+    /// canonical TQ+'s `q8_0-K + turbo4-V` shape; on Qwen3-0.6B-4bit
+    /// the FFAI KLD harness measures mean_kld=0.029 + same-top=89%
+    /// (vs aura4v4's 1.24 / 47%, a 43× quality improvement at 50%
+    /// size cost). The K-side precision is what dominates attention
+    /// quality (softmax exponentiates K-score errors); V can be
+    /// aggressive cheaply.
+    public static let aura8v4 = AURAScheme(keyBits: 8, valueBits: 4)
+
+    /// Sibling of `aura8v4` — 8-bit K + 2-bit V. Tightest size at
+    /// preserved K precision.
+    public static let aura8v2 = AURAScheme(keyBits: 8, valueBits: 2)
+
+    /// Resolves the auto-asymmetric K-protection policy for a requested
+    /// scheme. Mirrors canonical TQ+'s `TURBO_AUTO_ASYMMETRIC` env
+    /// behavior in the reference C++ KV-cache implementation, including
+    /// its threshold of 6.
+    ///
+    /// Rationale: with a high GQA fan-out (gqaFactor ≥ 6) each shared K
+    /// row is scored by many Q heads, so small K quantization errors
+    /// are amplified by the softmax and compound across the GQA group.
+    /// The production fix is to keep K at the highest available
+    /// precision (8-bit Lloyd-Max in AURA-land, q8_0 in canonical TQ+).
+    ///
+    /// Result:
+    ///   - `gqaFactor < 6`: `requested`, unchanged.
+    ///   - `gqaFactor ≥ 6` and `requested.keyBits < 8`: a scheme with
+    ///     keyBits raised to 8 and `valueBits` carried over.
+    ///   - `gqaFactor ≥ 6` and `requested.keyBits ≥ 8`: `requested`,
+    ///     unchanged (K is already protected).
+    ///
+    /// This is a pure resolver: it applies the policy whenever the
+    /// conditions hold. **Opting in is the call site's job** — model
+    /// loaders gate the call on `FFAI_AURA_AUTO_ASYM=1`, and a per-load
+    /// `LoadOptions` flag will replace the env knob in a follow-up.
+    /// Tests and future API callers that want the canonical TQ+
+    /// behaviour can call this directly with no env coupling.
+    public static func autoAsymmetric(
+        requested: AURAScheme, gqaFactor: Int
+    ) -> AURAScheme {
+        // Only a high-fan-out model with sub-8-bit K needs rewriting.
+        guard gqaFactor >= 6, requested.keyBits < 8 else {
+            return requested
+        }
+        return AURAScheme(keyBits: 8, valueBits: requested.valueBits)
+    }
+
+    /// True when the caller has opted into the auto-asymmetric policy
+    /// via `FFAI_AURA_AUTO_ASYM=1`. Read once, lazily, on first access
+    /// (Swift static stored properties initialize on first use). Default
+    /// OFF — the caller must explicitly request the policy.
+    public static let autoAsymmetricOptedIn: Bool = {
+        ProcessInfo.processInfo.environment["FFAI_AURA_AUTO_ASYM"] == "1"
+    }()
+
     /// Parse a CLI / config string. Accepts:
     ///
     /// - `aura` — the stability-first default (aura4v4).

diff --git a/Sources/FFAI/Loader/LoadOptions.swift b/Sources/FFAI/Loader/LoadOptions.swift
@@ -57,25 +57,31 @@ public enum DispatchMode: Sendable {
 /// Only relevant when `LoadOptions.kvCache == .auraQuantized(...)` —
 /// raw / affine caches ignore this setting.
 public enum AURADecodePath: Sendable, Equatable {
-    /// **Default.** Compressed-domain attention via the
-    /// `aura_flash_p1` + `aura_flash_pass2` kernel pair. Q is rotated,
+    /// **Default.** Compressed-domain attention via the 2-pass FA-2
+    /// kernel pair (`aura_flash_p1` + `aura_flash_pass2`) when emitted
+    /// for the (keyBits, valueBits, headDim, dtype) combo, with the
+    /// single-pass `aura_flash_sdpa` as fallback for cells the 2-pass
+    /// kernel hasn't been emitted for. Q is rotated + pre-scaled,
     /// scored directly against the packed K codes (no full-precision
-    /// dequant), then combined with the packed V codes — the kernel
-    /// dequantises per-tile on chip, never materialising a maxSeq-sized
-    /// f16 mirror buffer. Realises AURA's full memory savings (~4× at
-    /// `aura4v2`).
+    /// dequant), and the V codes are dequanted per-tile on chip. The
+    /// `[nKVHeads, maxSeq, headDim]` mirror buffer never materialises,
+    /// realising AURA's memory savings (~1.88× at aura4v4, ~3.7× at
+    /// aura4v2 on Qwen3 d=128).
+    ///
+    /// AURA-dtype unification (metaltile + FFAI joint change) put the
+    /// per-token norms and per-scheme codebook into the activation
+    /// dtype, so encode + both decode kernel paths consume the cache
+    /// buffers directly — no per-call f32 cast on the decode hot path
+    /// and no parallel f32 mirror storage.
     case compressed
 
-    /// Stage 1a behaviour. `prepareForAttention(on:)` dequantises the
-    /// full compressed K/V cache into per-layer shared working buffers
-    /// (`sharedWorkingK` / `sharedWorkingV`, sized
-    /// `[nKVHeads, maxSeq, headDim]`), and the standard
-    /// `Ops.sdpaDecode` reads those. Preserves AURA's quality but
-    /// **gives back the memory savings** — the mirror is the same size
-    /// as a raw fp16 cache. Kept as an opt-in path for A/B benching
-    /// (`compressed` vs `dequantMirror` speed at production shapes)
-    /// and for callers with the memory headroom who want
-    /// matrix-engine SDPA.
+    /// Dequant-mirror path. `prepareForAttention(on:)` materialises
+    /// the full compressed K/V cache into per-layer shared working
+    /// buffers (`sharedWorkingK` / `sharedWorkingV`, sized
+    /// `[nKVHeads, maxSeq, headDim]`) and `Ops.sdpaDecode` reads those.
+    /// Same quality as `.compressed`, **gives back the memory
+    /// savings** — the mirror is the same size as a raw fp16 cache.
+    /// Useful as an A/B baseline against the compressed path.
     case dequantMirror
 }
 
@@ -113,13 +119,13 @@ public struct LoadOptions: Sendable {
     /// entire advertised window, or a smaller value to bound memory.
     public var maxContextLength: Int?
 
-    /// Selects the AURA decode path. Defaults to `.compressed` (Stage
-    /// 1b: attend on packed K/V codes directly via the `aura_flash_*`
-    /// kernel pair — full ~4× memory savings). Set to `.dequantMirror`
-    /// for the Stage 1a path that maintains a full-precision
-    /// `[nKVHeads, maxSeq, headDim]` mirror buffer and runs the
-    /// standard `Ops.sdpaDecode` against it — useful for A/B speed
-    /// benching. Has no effect when `kvCache != .auraQuantized(...)`.
+    /// Selects the AURA decode path. Defaults to `.compressed` — the
+    /// 2-pass FA-2 kernel pair gives token-parallel attention over the
+    /// packed K/V codes directly, with no f16/f32 mirror materialised.
+    /// Set to `.dequantMirror` for an A/B baseline that dequants the
+    /// cache into a per-layer working buffer and runs the standard
+    /// `Ops.sdpaDecode` against it. Has no effect when
+    /// `kvCache != .auraQuantized(...)`.
     public var auraDecodePath: AURADecodePath
 
     public init(

diff --git a/Sources/FFAI/Models/MoELayer.swift b/Sources/FFAI/Models/MoELayer.swift
@@ -532,6 +532,7 @@ public final class MoELayer: Module, DecoderLayer {
         cache _: any LayerCacheProtocol,
         cmd: MTLCommandBuffer, device: Device
     ) -> Tensor {
+        return Profile.signpost("moe.decode") { () -> Tensor in
         precondition(
             h.elementCount == hidden,
             "MoELayer.decode: input has \(h.elementCount) elements, expected hidden \(hidden)")
@@ -631,6 +632,7 @@ public final class MoELayer: Module, DecoderLayer {
         // MoE layer per token (Qwen3.6-A3B = 40 layers).
         work.commit()
         return result
+        }  // Profile.signpost("moe.decode")
     }
 
     /// T-batched MoE forward. `hFlat` is `[T, hidden]` flat; returns

diff --git a/Sources/FFAI/Models/Text/LlamaText.swift b/Sources/FFAI/Models/Text/LlamaText.swift
@@ -553,25 +553,32 @@ public final class LlamaModel: LanguageModel {
                     device: device
                 )
             }
-        case .auraQuantized(let scheme):
+        case .auraQuantized(let requestedScheme):
+            // Auto-asymmetric policy: bump K to 8-bit when GQA ≥ 6.
+            // Mirrors canonical TQ+'s TURBO_AUTO_ASYMMETRIC behavior.
+            // **Opt-in** — default OFF; set `FFAI_AURA_AUTO_ASYM=1` to
+            // enable. A per-load `LoadOptions` flag will replace the
+            // env knob in a follow-up.
+            let gqaFactor = nHeads / max(nKVHeads, 1)
+            let scheme: AURAScheme = AURAScheme.autoAsymmetricOptedIn
+                ? AURAScheme.autoAsymmetric(
+                    requested: requestedScheme, gqaFactor: gqaFactor)
+                : requestedScheme
             // Codebooks are shared across layers; rotations are per-layer
             // (deterministic SRHT seeded by layer index). See Qwen3's
             // matching case for the longer explanation.
-            let kCodebookData = AURACodebook.centroids(dim: headDim, bits: scheme.keyBits)
-            let kBoundariesData = AURACodebook.boundaries(dim: headDim, bits: scheme.keyBits)
-            let vCodebookData = AURACodebook.centroids(dim: headDim, bits: scheme.valueBits)
-            let vBoundariesData = AURACodebook.boundaries(dim: headDim, bits: scheme.valueBits)
-
-            let kCodebook = Tensor.empty(shape: [kCodebookData.count], dtype: .f32, device: device)
-            kCodebook.copyIn(from: kCodebookData)
-            let kBoundaries = Tensor.empty(
-                shape: [kBoundariesData.count], dtype: .f32, device: device)
-            kBoundaries.copyIn(from: kBoundariesData)
-            let vCodebook = Tensor.empty(shape: [vCodebookData.count], dtype: .f32, device: device)
-            vCodebook.copyIn(from: vCodebookData)
-            let vBoundaries = Tensor.empty(
-                shape: [vBoundariesData.count], dtype: .f32, device: device)
-            vBoundaries.copyIn(from: vBoundariesData)
+            // Codebook and boundaries both live in the cache dtype,
+            // matching the encode/decode kernel signatures (no
+            // per-call cast) and the dtype preconditions enforced by
+            // `AURAQuantizedKVCache`.
+            let kCodebook = AURACodebook.centroidsTensor(
+                dim: headDim, bits: scheme.keyBits, dtype: dtype, device: device)
+            let kBoundaries = AURACodebook.boundariesTensor(
+                dim: headDim, bits: scheme.keyBits, dtype: dtype, device: device)
+            let vCodebook = AURACodebook.centroidsTensor(
+                dim: headDim, bits: scheme.valueBits, dtype: dtype, device: device)
+            let vBoundaries = AURACodebook.boundariesTensor(
+                dim: headDim, bits: scheme.valueBits, dtype: dtype, device: device)
 
             let sharedK = Tensor.empty(
                 shape: [nKVHeads, cap, headDim],