From 6eb7d4cf829bb224c12e62a5ea7747a1cf683821 Mon Sep 17 00:00:00 2001
From: Ken Ahrens <ken.ahrens@gmail.com>
Date: Mon, 25 May 2026 20:40:22 -0400
Subject: [PATCH] speed-bench: add M5 Max 128GB q2-q4-imatrix curve

Bench data for the q2-q4-imatrix mixed Flash quant (last 6 expert
layers Q4K, rest IQ2XXS) on M5 Max 128GB, macOS 26.4.1.

Fills the unanswered request in #226 for q2-q4-imatrix benchmark
numbers, and extends published M5 Max coverage past the 65K point
from #97 into the 100K-200K range.

Command:

    ds4-bench -m ds4flash.gguf \
      --prompt-file speed-bench/promessi_sposi.txt \
      --ctx-start 2048 --ctx-max 200000 \
      --step-incr 16384 --gen-tokens 128

Build: ad0209f (Metal 4 tensor API + decode-indexer top-k path
from #169 enabled).

Highlights vs M5 Max q2-imatrix from #97 (same hardware tier):
- 2K decode: 34.4 t/s (vs 31.5 t/s, +9%)
- 2K prefill: 413.9 t/s (vs 372.2 t/s, +11%)
- 32K decode: 27.8 t/s at 34,816 ctx (vs 28.9 t/s, -4%)
- 65K decode: 25.8 t/s at 67,584 ctx (vs 27.0 t/s, -4%)

q2-q4 is faster than q2 at low ctx (Q4 layers + Metal 4 win) and
~4% slower at 32K-65K (more bandwidth-bound); the q2 data from #97
stops at 65K, so there is no baseline for the 100K-200K points.
Closes #226 with data.
---
 speed-bench/m5_max_q2q4_imatrix.csv    | 15 ++++++++
 speed-bench/m5_max_q2q4_imatrix_ts.svg | 52 ++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 speed-bench/m5_max_q2q4_imatrix.csv
 create mode 100644 speed-bench/m5_max_q2q4_imatrix_ts.svg
diff --git a/speed-bench/m5_max_q2q4_imatrix.csv b/speed-bench/m5_max_q2q4_imatrix.csv
new file mode 100644
index 000000000..b84bf647a
--- /dev/null
+++ b/speed-bench/m5_max_q2q4_imatrix.csv
@@ -0,0 +1,15 @@
+ctx_tokens,prefill_tokens,prefill_tps,gen_tokens,gen_tps,kvcache_bytes
+2048,2048,413.85,128,34.42,52184460
+18432,16384,405.31,128,28.42,277693836
+34816,16384,374.49,128,27.75,503203212
+51200,16384,333.84,128,26.79,728712588
+67584,16384,298.66,128,25.75,954221964
+83968,16384,269.69,128,25.43,1179731340
+100352,16384,248.99,128,24.36,1405240716
+116736,16384,230.49,128,23.63,1630750092
+133120,16384,215.12,128,22.37,1856259468
+149504,16384,198.15,128,21.70,2081768844
+165888,16384,187.32,128,20.72,2307278220
+182272,16384,176.49,128,20.16,2532787596
+198656,16384,165.14,128,19.54,2758296972
+200000,1344,157.02,128,19.37,2776775308
diff --git a/speed-bench/m5_max_q2q4_imatrix_ts.svg b/speed-bench/m5_max_q2q4_imatrix_ts.svg
new file mode 100644
index 000000000..219dbafa0
--- /dev/null
+++ b/speed-bench/m5_max_q2q4_imatrix_ts.svg
@@ -0,0 +1,52 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="960" height="540" viewBox="0 0 960 540">
+<style>
+text { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; }
+.title { font-size: 26px; font-weight: 700; fill: #1f2933; }
+.axis-label { font-size: 14px; font-weight: 600; fill: #334155; }
+.tick { font-size: 12px; fill: #64748b; }
+.legend { font-size: 13px; font-weight: 600; fill: #1f2933; }
+</style>
+<rect width="960" height="540" fill="#ffffff"/>
+<text class="title" x="480.0" y="34" text-anchor="middle">M5 Max (128GB) q2-q4-imatrix t/s</text>
+<line x1="82" y1="468.00" x2="878" y2="468.00" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="70" y="472.00" text-anchor="end">0</text>
+<line x1="82" y1="387.60" x2="878" y2="387.60" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="70" y="391.60" text-anchor="end">100</text>
+<line x1="82" y1="307.20" x2="878" y2="307.20" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="70" y="311.20" text-anchor="end">200</text>
+<line x1="82" y1="226.80" x2="878" y2="226.80" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="70" y="230.80" text-anchor="end">300</text>
+<line x1="82" y1="146.40" x2="878" y2="146.40" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="70" y="150.40" text-anchor="end">400</text>
+<line x1="82" y1="66.00" x2="878" y2="66.00" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="70" y="70.00" text-anchor="end">500</text>
+<text class="tick" x="890" y="472.00" text-anchor="start">0</text>
+<text class="tick" x="890" y="371.50" text-anchor="start">10</text>
+<text class="tick" x="890" y="271.00" text-anchor="start">20</text>
+<text class="tick" x="890" y="170.50" text-anchor="start">30</text>
+<text class="tick" x="890" y="70.00" text-anchor="start">40</text>
+<line x1="82.00" y1="66" x2="82.00" y2="468" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="82.00" y="492" text-anchor="middle">0</text>
+<line x1="281.00" y1="66" x2="281.00" y2="468" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="281.00" y="492" text-anchor="middle">50k</text>
+<line x1="480.00" y1="66" x2="480.00" y2="468" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="480.00" y="492" text-anchor="middle">100k</text>
+<line x1="679.00" y1="66" x2="679.00" y2="468" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="679.00" y="492" text-anchor="middle">150k</text>
+<line x1="878.00" y1="66" x2="878.00" y2="468" stroke="#e2e8f0" stroke-width="1"/>
+<text class="tick" x="878.00" y="492" text-anchor="middle">200k</text>
+<line x1="82" y1="66" x2="82" y2="468" stroke="#334155" stroke-width="1.4"/>
+<line x1="878" y1="66" x2="878" y2="468" stroke="#334155" stroke-width="1.4"/>
+<line x1="82" y1="468" x2="878" y2="468" stroke="#334155" stroke-width="1.4"/>
+<text class="axis-label" x="480.0" y="520" text-anchor="middle">ctx size</text>
+<text class="axis-label" x="22" y="267.0" text-anchor="middle" transform="rotate(-90 22 267.0)">prefill t/s</text>
+<text class="axis-label" x="938" y="267.0" text-anchor="middle" transform="rotate(90 938 267.0)">generation t/s</text>
+<polyline fill="none" stroke="#2563eb" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" points="90.15,135.26 155.36,142.13 220.57,166.91 285.78,199.59 350.98,227.88 416.19,251.17 481.40,267.81 546.61,282.69 611.82,295.04 677.03,308.69 742.23,317.39 807.44,326.10 872.65,335.23 878.00,341.76"/>
+<polyline fill="none" stroke="#dc2626" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" points="90.15,122.08 155.36,182.38 220.57,189.11 285.78,198.76 350.98,209.21 416.19,212.43 481.40,223.18 546.61,230.52 611.82,243.18 677.03,249.92 742.23,259.76 807.44,265.39 872.65,271.62 878.00,273.33"/>
+<rect x="694" y="66" width="176" height="62" rx="6" fill="#ffffff" stroke="#cbd5e1"/>
+<rect x="708" y="78" width="12" height="12" fill="#2563eb"/>
+<text class="legend" x="730" y="89">prefill</text>
+<rect x="708" y="104" width="12" height="12" fill="#dc2626"/>
+<text class="legend" x="730" y="115">generation</text>
+</svg>