Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion Sources/FFAI/KVCache/AURACodebook.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
// the coordinate distribution of unit-sphere vectors converges to a
// near-Gaussian, so a fixed Lloyd-Max table is near-optimal.
//
// The reference values here are mined from llama.cpp's `k_quants`
// The reference values here are mined from the reference C++ `k_quants`
// tables (empirically optimal for unit-norm Gaussian data at d=128)
// and scaled to other head dims by √(128 / dim) — a heuristic that
// approximates the analytic 1/√d Beta-variance scaling from the
Expand Down Expand Up @@ -246,6 +246,56 @@ public enum AURACodebook {
return base.map { $0 * scale }
}

/// Builds the Lloyd-Max centroid table for `(dim, bits)` and returns it
/// as a 1-D tensor of `dtype` on `device`.
///
/// The AURA cache keeps the codebook in the same dtype as the model
/// activations, so the encode and decode kernels (which take the
/// codebook as `Tensor<T>`) read it directly with no per-call cast.
/// The centroid values are produced as `Float` by `centroids(dim:bits:)`;
/// narrowing to `f16` / `bf16` happens once, on the host, inside
/// `writeFloatsToTensor`.
///
/// - Parameters:
///   - dim: Head dimension the table is scaled for.
///   - bits: Quantisation bit width; selects the table.
///   - dtype: Storage dtype of the returned tensor.
///   - device: Device the tensor is allocated on.
/// - Returns: A tensor with one element per centroid.
public static func centroidsTensor(
    dim: Int, bits: Int, dtype: DType, device: Device = .shared
) -> Tensor {
    let table = centroids(dim: dim, bits: bits)
    let shape = [table.count]
    return writeFloatsToTensor(table, shape: shape, dtype: dtype, device: device)
}

/// Builds the Lloyd-Max boundary (threshold) table for `(dim, bits)`
/// and returns it as a 1-D tensor of `dtype` on `device`.
///
/// Post-metaltile #226, `aura_encode` takes `boundaries: Tensor<T>`,
/// which is a kernel-side bandwidth win (Π + boundaries dominate the
/// encode kernel's memory traffic). The boundary values are produced
/// as `Float` by `boundaries(dim:bits:)`; narrowing to `bf16` / `f16`
/// happens once, on the host, inside `writeFloatsToTensor`. The
/// resulting conversion error (~1e-3) sits well below the 2-4-bit
/// quant bin, so the matched-norm correction stays stable.
///
/// - Parameters:
///   - dim: Head dimension the table is scaled for.
///   - bits: Quantisation bit width; selects the table.
///   - dtype: Storage dtype of the returned tensor.
///   - device: Device the tensor is allocated on.
/// - Returns: A tensor with one element per boundary.
public static func boundariesTensor(
    dim: Int, bits: Int, dtype: DType, device: Device = .shared
) -> Tensor {
    let thresholds = boundaries(dim: dim, bits: bits)
    let shape = [thresholds.count]
    return writeFloatsToTensor(thresholds, shape: shape, dtype: dtype, device: device)
}

/// CPU-side host conversion from `[Float]` into a freshly allocated
/// tensor of the requested float dtype. Shared by `centroidsTensor`
/// and `boundariesTensor`, and usable by any caller that needs
/// Lloyd-Max-precise values landed into narrow storage.
///
/// Conversion per dtype:
/// - `.f32`: values are copied unchanged.
/// - `.f16`: each value is converted with `Float16(_:)`.
/// - `.bf16`: each value is narrowed to the upper 16 bits of its
///   single-precision bit pattern using round-to-nearest-even, so the
///   stored value is the closest representable bf16 rather than a
///   truncation biased toward zero.
///
/// - Parameters:
///   - values: Source values in `Float`.
///   - shape: Shape of the tensor to allocate.
///   - dtype: Target storage dtype; must be `.f32`, `.f16` or `.bf16`.
///   - device: Device the tensor is allocated on.
/// - Returns: The newly allocated tensor holding the converted values.
/// - Note: Any other dtype is treated as a programmer error and traps.
private static func writeFloatsToTensor(
    _ values: [Float], shape: [Int],
    dtype: DType, device: Device
) -> Tensor {
    /// Narrows a `Float` to its bfloat16 bit pattern with
    /// round-to-nearest-even.
    func bfloat16Bits(_ value: Float) -> UInt16 {
        let bits = value.bitPattern
        if value.isNaN {
            // Rounding could carry a NaN payload into the exponent and
            // turn it into an infinity; force a mantissa bit that
            // survives the narrowing so NaN stays NaN.
            return UInt16(truncatingIfNeeded: (bits >> 16) | 0x0040)
        }
        // Add 0x7FFF plus the lowest kept bit: ties round to the even
        // neighbour, everything else rounds to nearest. Non-NaN
        // patterns cannot overflow UInt32 here.
        let roundingBias: UInt32 = 0x7FFF + ((bits >> 16) & 1)
        return UInt16(truncatingIfNeeded: (bits &+ roundingBias) >> 16)
    }

    let t = Tensor.empty(shape: shape, dtype: dtype, device: device)
    switch dtype {
    case .f32:
        t.copyIn(from: values)
    case .f16:
        t.copyIn(from: values.map { Float16($0) })
    case .bf16:
        t.copyIn(from: values.map(bfloat16Bits))
    default:
        // Reached from both centroidsTensor and boundariesTensor, so the
        // message names the type rather than a single entry point.
        fatalError(
            "AURACodebook: unsupported dtype \(dtype); "
                + "AURA cache supports f32 / f16 / bf16")
    }
    return t
}

/// Bytes-per-token after AURA packing at this bit width and dim.
/// `ceil(dim * bits / 32) * 4` for the packed u32 array, plus 4
/// bytes for the f32 per-token norm. Excludes any per-vector DC
Expand Down
53 changes: 37 additions & 16 deletions Sources/FFAI/KVCache/AURAQuantizedKVCache.swift
Original file line number Diff line number Diff line change
Expand Up @@ -107,16 +107,21 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
/// Π^T in the activation dtype, used to un-rotate the SDPA output
/// before `oProj`. Aliases `rotationT` when `dtype == .f32`.
public let rotationDtypeT: Tensor
public let kCodebook: Tensor // [2^keyBits] f32
public let kBoundaries: Tensor // [2^keyBits-1] f32
public let vCodebook: Tensor // [2^valueBits] f32
public let vBoundaries: Tensor // [2^valueBits-1] f32
/// Codebook in the cache dtype. Encode + decode kernels read
/// directly with no per-call cast — the dtype unification landed
/// when the single-pass `aura_flash_sdpa` kernel was migrated to
/// `Tensor<T>` (matches the production C++ TQ+ fork pattern: fp16-
/// stored norms / codebook, f32-at-use via cast-at-load).
public let kCodebook: Tensor // [2^keyBits] dtype
public let kBoundaries: Tensor // [2^keyBits-1] dtype — Lloyd-Max thresholds (Tensor<T> per metaltile #226)
public let vCodebook: Tensor // [2^valueBits] dtype
public let vBoundaries: Tensor // [2^valueBits-1] dtype

// Per-cache compressed storage.
public let kPacked: Tensor // [nKVHeads, maxSeq, kPackedWidth] u32
public let vPacked: Tensor // [nKVHeads, maxSeq, vPackedWidth] u32
public let kNorms: Tensor // [nKVHeads, maxSeq] f32
public let vNorms: Tensor // [nKVHeads, maxSeq] f32
public let kNorms: Tensor // [nKVHeads, maxSeq] dtype — encode writes T, decode reads T
public let vNorms: Tensor // [nKVHeads, maxSeq] dtype

// Shared working buffers — bulk-dequant target; reused across layers.
public let sharedWorkingK: Tensor // [nKVHeads, maxSeq, headDim] dtype
Expand Down Expand Up @@ -192,11 +197,18 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
"AURAQuantizedKVCache: rotationDtype/rotationDtypeT dtype must match cache dtype \(dtype)"
)
precondition(
kCodebook.dtype == .f32 && kBoundaries.dtype == .f32,
"AURAQuantizedKVCache: K codebook/boundaries must be f32")
kCodebook.dtype == dtype,
"AURAQuantizedKVCache: K codebook dtype must match cache dtype \(dtype)")
precondition(
vCodebook.dtype == .f32 && vBoundaries.dtype == .f32,
"AURAQuantizedKVCache: V codebook/boundaries must be f32")
kBoundaries.dtype == dtype,
"AURAQuantizedKVCache: K boundaries dtype must match cache dtype \(dtype) "
+ "— metaltile #226 unified rotation/boundaries to Tensor<T>")
precondition(
vCodebook.dtype == dtype,
"AURAQuantizedKVCache: V codebook dtype must match cache dtype \(dtype)")
precondition(
vBoundaries.dtype == dtype,
"AURAQuantizedKVCache: V boundaries dtype must match cache dtype \(dtype)")
precondition(
sharedWorkingK.shape == [nKVHeads, maxSeq, headDim],
"AURAQuantizedKVCache: sharedWorkingK shape mismatch")
Expand Down Expand Up @@ -232,9 +244,9 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
self.vPacked = Tensor.empty(
shape: [nKVHeads, maxSeq, vPackedWidth], dtype: .u32, device: device)
self.kNorms = Tensor.empty(
shape: [nKVHeads, maxSeq], dtype: .f32, device: device)
shape: [nKVHeads, maxSeq], dtype: dtype, device: device)
self.vNorms = Tensor.empty(
shape: [nKVHeads, maxSeq], dtype: .f32, device: device)
shape: [nKVHeads, maxSeq], dtype: dtype, device: device)

// Codec is purely additive in atomic_or terms, so packed slots
// MUST start zeroed. Norms slots get overwritten per encode but
Expand Down Expand Up @@ -377,7 +389,10 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
let inputBytesPerHead = headDim * dtype.byteSize
let packedBytesPerSlot = packedWidth * 4 // u32
let packedBytesPerHead = maxSeq * packedBytesPerSlot
let normBytesPerHead = maxSeq * 4 // f32
// Norms are stored in the cache dtype post-unification — stride
// tracks the activation dtype's byte size, not the legacy 4 (f32).
let normByteSize = dtype.byteSize
let normBytesPerHead = maxSeq * normByteSize

for h in 0 ..< nKVHeads {
let inputView = Tensor(
Expand All @@ -390,10 +405,16 @@ public final class AURAQuantizedKVCache: KVCacheProtocol, @unchecked Sendable {
shape: [1, packedWidth], dtype: .u32)
let normsView = Tensor(
buffer: norms.buffer,
offset: norms.offset + h * normBytesPerHead + pos * 4,
shape: [1], dtype: .f32)
offset: norms.offset + h * normBytesPerHead + pos * normByteSize,
shape: [1], dtype: dtype)
// metaltile #226: aura_encode now takes rotation+boundaries
// as Tensor<T>. Use the activation-dtype copy of Π
// (`rotationDtype`) instead of the legacy f32 `rotation`
// field — the f32 field is kept around for any future
// kernel that wants f32 rotations, but the encoder no
// longer does.
Ops.auraEncode(
input: inputView, rotation: rotation,
input: inputView, rotation: rotationDtype,
boundaries: boundaries, codebook: codebook,
packedOut: packedView, normsOut: normsView,
rows: 1, dim: headDim, packedWidth: packedWidth, bits: bits,
Expand Down
55 changes: 55 additions & 0 deletions Sources/FFAI/KVCache/AURAScheme.swift
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,61 @@ public struct AURAScheme: Sendable, Equatable, Hashable {
/// near-baseline quality on tested attention-only models.
public static let aura4v2 = AURAScheme(keyBits: 4, valueBits: 2)

/// Production K-protected recipe — 8-bit K + 4-bit V. Matches
/// canonical TQ+'s `q8_0-K + turbo4-V` shape. On Qwen3-0.6B-4bit the
/// FFAI KLD harness measures mean_kld=0.029 and same-top=89%, versus
/// 1.24 and 47% for aura4v4 — roughly a 43× lower mean KLD at 50%
/// size cost. K-side precision is what dominates attention quality
/// (softmax exponentiates K-score errors), so V can be quantised
/// aggressively at little cost.
public static let aura8v4 = AURAScheme(keyBits: 8, valueBits: 4)

/// Sibling of `aura8v4` — 8-bit K + 2-bit V. The smallest footprint
/// that still keeps K at 8-bit precision.
public static let aura8v2 = AURAScheme(keyBits: 8, valueBits: 2)

/// Auto-asymmetric-policy resolver. Mirrors canonical TQ+'s
/// `TURBO_AUTO_ASYMMETRIC` env behavior: when the model has a
/// high GQA fan-out (gqaFactor ≥ 6), shared K rows get
/// "amplified" by the softmax across many Q heads — small K
/// quantization errors compound across the GQA group. The
/// production fix is to keep K at the highest available precision
/// (8-bit Lloyd-Max in AURA-land, q8_0 in canonical TQ+).
///
/// Behavior:
/// - If `gqaFactor < 6`, return `requested` unchanged.
/// - If `gqaFactor ≥ 6` and `requested.keyBits < 8`, return a
/// scheme with keyBits bumped to 8 (V untouched).
/// - If `gqaFactor ≥ 6` and `requested.keyBits ≥ 8`, return
/// `requested` unchanged (already protected).
///
/// Pure resolver — always applies the policy when conditions are
/// met. **The policy itself is not opt-in here**; the opt-in lives
/// at the call site (model loaders gate this on
/// `FFAI_AURA_AUTO_ASYM=1`, and a per-load `LoadOptions` flag will
/// replace the env knob in a follow-up). Tests + future API
/// callers that want the canonical TQ+ behaviour can invoke this
/// directly without env coupling.
///
/// Canonical-source mapping: TURBO_AUTO_ASYMMETRIC in
/// the reference C++ KV-cache implementation. Threshold = 6
/// matches that reference.
public static func autoAsymmetric(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should do auto asymmetric by default. I think the caller should explicitly declare what they want and opt-in to automatic switching.

My stance in general is make things clear what they do. No/minimal magic. If you want magic, opt-in because you know how it works.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done (66a1238) — auto-asymmetric is opt-in: default OFF, requires FFAI_AURA_AUTO_ASYM=1 (gated by AURAScheme.autoAsymmetricOptedIn; the autoAsymmetric(...) policy fn only runs when the caller opts in). No magic by default, per your stance.

requested: AURAScheme, gqaFactor: Int
) -> AURAScheme {
if gqaFactor < 6 { return requested }
if requested.keyBits >= 8 { return requested }
return AURAScheme(keyBits: 8, valueBits: requested.valueBits)
}

/// Whether the process has opted into the auto-asymmetric policy:
/// `true` only when the environment variable `FFAI_AURA_AUTO_ASYM` is
/// exactly `"1"`. Default OFF, following the project's "no magic by
/// default" stance — the caller must explicitly request the policy.
///
/// The environment is consulted once, when this property is first
/// accessed (Swift initialises stored type properties lazily), and the
/// result stays fixed for the rest of the process.
public static let autoAsymmetricOptedIn: Bool =
    ProcessInfo.processInfo.environment["FFAI_AURA_AUTO_ASYM"] == "1"

/// Parse a CLI / config string. Accepts:
///
/// - `aura` — the stability-first default (aura4v4).
Expand Down
52 changes: 29 additions & 23 deletions Sources/FFAI/Loader/LoadOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -57,25 +57,31 @@ public enum DispatchMode: Sendable {
/// Only relevant when `LoadOptions.kvCache == .auraQuantized(...)` —
/// raw / affine caches ignore this setting.
public enum AURADecodePath: Sendable, Equatable {
/// **Default.** Compressed-domain attention. Uses the 2-pass FA-2
/// kernel pair (`aura_flash_p1` + `aura_flash_pass2`) when it has been
/// emitted for the (keyBits, valueBits, headDim, dtype) combo, and
/// falls back to the single-pass `aura_flash_sdpa` kernel for cells
/// the 2-pass pair hasn't been emitted for. Q is rotated and
/// pre-scaled, then scored directly against the packed K codes (no
/// full-precision dequant); the packed V codes are dequantised
/// per-tile on chip. The `[nKVHeads, maxSeq, headDim]` mirror buffer
/// is never materialised, which realises AURA's memory savings
/// (~1.88× at aura4v4, ~3.7× at aura4v2 on Qwen3 d=128).
///
/// AURA-dtype unification (metaltile + FFAI joint change) put the
/// per-token norms and per-scheme codebook into the activation
/// dtype, so encode + both decode kernel paths consume the cache
/// buffers directly — no per-call f32 cast on the decode hot path
/// and no parallel f32 mirror storage.
case compressed

/// Dequant-mirror path (the earlier "Stage 1a" behaviour).
/// `prepareForAttention(on:)` dequantises the full compressed K/V
/// cache into per-layer shared working buffers (`sharedWorkingK` /
/// `sharedWorkingV`, sized `[nKVHeads, maxSeq, headDim]`), and the
/// standard `Ops.sdpaDecode` reads those. Same quality as
/// `.compressed`, but **gives back the memory savings** — the mirror
/// is the same size as a raw fp16 cache. Kept as an opt-in path for
/// A/B benching against the compressed path at production shapes,
/// and for callers with the memory headroom who want matrix-engine
/// SDPA.
case dequantMirror
}

Expand Down Expand Up @@ -113,13 +119,13 @@ public struct LoadOptions: Sendable {
/// entire advertised window, or a smaller value to bound memory.
public var maxContextLength: Int?

/// Selects the AURA decode path. Defaults to `.compressed` (Stage 1b):
/// the 2-pass FA-2 kernel pair (`aura_flash_*`) gives token-parallel
/// attention over the packed K/V codes directly, with no f16/f32
/// mirror materialised. Set to `.dequantMirror` (Stage 1a) for an A/B
/// baseline that dequantises the cache into a per-layer
/// `[nKVHeads, maxSeq, headDim]` working buffer and runs the standard
/// `Ops.sdpaDecode` against it. Has no effect when
/// `kvCache != .auraQuantized(...)`.
public var auraDecodePath: AURADecodePath

public init(
Expand Down
2 changes: 2 additions & 0 deletions Sources/FFAI/Models/MoELayer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,7 @@ public final class MoELayer: Module, DecoderLayer {
cache _: any LayerCacheProtocol,
cmd: MTLCommandBuffer, device: Device
) -> Tensor {
return Profile.signpost("moe.decode") { () -> Tensor in
precondition(
h.elementCount == hidden,
"MoELayer.decode: input has \(h.elementCount) elements, expected hidden \(hidden)")
Expand Down Expand Up @@ -631,6 +632,7 @@ public final class MoELayer: Module, DecoderLayer {
// MoE layer per token (Qwen3.6-A3B = 40 layers).
work.commit()
return result
} // Profile.signpost("moe.decode")
}

/// T-batched MoE forward. `hFlat` is `[T, hidden]` flat; returns
Expand Down
39 changes: 23 additions & 16 deletions Sources/FFAI/Models/Text/LlamaText.swift
Original file line number Diff line number Diff line change
Expand Up @@ -553,25 +553,32 @@ public final class LlamaModel: LanguageModel {
device: device
)
}
case .auraQuantized(let scheme):
case .auraQuantized(let requestedScheme):
// Auto-asymmetric policy: bump K to 8-bit when GQA ≥ 6.
// Mirrors canonical TQ+'s TURBO_AUTO_ASYMMETRIC behavior.
// **Opt-in** — default OFF; set `FFAI_AURA_AUTO_ASYM=1` to
// enable. A per-load `LoadOptions` flag will replace the
// env knob in a follow-up.
let gqaFactor = nHeads / max(nKVHeads, 1)
let scheme: AURAScheme = AURAScheme.autoAsymmetricOptedIn
? AURAScheme.autoAsymmetric(
requested: requestedScheme, gqaFactor: gqaFactor)
: requestedScheme
// Codebooks are shared across layers; rotations are per-layer
// (deterministic SRHT seeded by layer index). See Qwen3's
// matching case for the longer explanation.
let kCodebookData = AURACodebook.centroids(dim: headDim, bits: scheme.keyBits)
let kBoundariesData = AURACodebook.boundaries(dim: headDim, bits: scheme.keyBits)
let vCodebookData = AURACodebook.centroids(dim: headDim, bits: scheme.valueBits)
let vBoundariesData = AURACodebook.boundaries(dim: headDim, bits: scheme.valueBits)

let kCodebook = Tensor.empty(shape: [kCodebookData.count], dtype: .f32, device: device)
kCodebook.copyIn(from: kCodebookData)
let kBoundaries = Tensor.empty(
shape: [kBoundariesData.count], dtype: .f32, device: device)
kBoundaries.copyIn(from: kBoundariesData)
let vCodebook = Tensor.empty(shape: [vCodebookData.count], dtype: .f32, device: device)
vCodebook.copyIn(from: vCodebookData)
let vBoundaries = Tensor.empty(
shape: [vBoundariesData.count], dtype: .f32, device: device)
vBoundaries.copyIn(from: vBoundariesData)
// Codebook and boundaries are both allocated in the cache dtype
// (matches the encode/decode kernel signatures — no per-call
// cast). The Lloyd-Max values are computed in Float and narrowed
// once on the host when the dtype is bf16/f16.
let kCodebook = AURACodebook.centroidsTensor(
dim: headDim, bits: scheme.keyBits, dtype: dtype, device: device)
let kBoundaries = AURACodebook.boundariesTensor(
dim: headDim, bits: scheme.keyBits, dtype: dtype, device: device)
let vCodebook = AURACodebook.centroidsTensor(
dim: headDim, bits: scheme.valueBits, dtype: dtype, device: device)
let vBoundaries = AURACodebook.boundariesTensor(
dim: headDim, bits: scheme.valueBits, dtype: dtype, device: device)

let sharedK = Tensor.empty(
shape: [nKVHeads, cap, headDim],
Expand Down
Loading