diff --git a/ds4.c b/ds4.c index 21573fb81..4c68ad210 100644 --- a/ds4.c +++ b/ds4.c @@ -8545,6 +8545,17 @@ typedef struct { ds4_gpu_tensor *layer_attn_state_kv[DS4_MAX_LAYER]; ds4_gpu_tensor *layer_attn_state_score[DS4_MAX_LAYER]; ds4_gpu_tensor *layer_index_comp_cache[DS4_MAX_LAYER]; + /* HISA: per-layer block representatives (mean-pool of HISA_BLOCK_SIZE + * consecutive index_comp rows). Sized n_blocks_max x 128 floats per + * layer, where n_blocks_max = ceil(layer_comp_cap / 128). Roughly + * 256 KB per layer at 256K ctx cap, allocated alongside the comp + * cache so the decode dispatch can switch into HISA whenever + * n_index_comp crosses the gate. */ + ds4_gpu_tensor *layer_hisa_block_reps[DS4_MAX_LAYER]; + /* HISA scratch: top-m block indices and block-scores, shared across + * layers (sized once for the largest layer's block count). */ + ds4_gpu_tensor *hisa_sel_blocks; + ds4_gpu_tensor *hisa_block_scores; ds4_gpu_tensor *layer_index_state_kv[DS4_MAX_LAYER]; ds4_gpu_tensor *layer_index_state_score[DS4_MAX_LAYER]; @@ -8825,6 +8836,11 @@ static void metal_graph_free(ds4_gpu_graph *g) { for (uint32_t il = 0; il < DS4_N_LAYER; il++) { ds4_gpu_tensor_free(g->layer_index_comp_cache[il]); } + for (uint32_t il = 0; il < DS4_N_LAYER; il++) { + ds4_gpu_tensor_free(g->layer_hisa_block_reps[il]); + } + ds4_gpu_tensor_free(g->hisa_sel_blocks); + ds4_gpu_tensor_free(g->hisa_block_scores); for (uint32_t il = 0; il < DS4_N_LAYER; il++) { ds4_gpu_tensor_free(g->layer_index_state_kv[il]); } @@ -9279,6 +9295,28 @@ static bool metal_graph_alloc_raw_cap( g->layer_index_comp_cache[il] = metal_graph_alloc_kv_cache_tensor( managed_kv_cache, (uint64_t)g->layer_comp_cap[il] * DS4_N_INDEXER_HEAD_DIM * sizeof(float)); + /* HISA scratch: per-layer block reps plus the shared + * selection and score scratches. n_blocks_max = + * ceil(layer_comp_cap / 128). Small enough at any + * sensible ctx (~256 KB per layer at 256K cap) that we + * always allocate; the decode dispatch decides whether + * to actually use HISA based on n_index_comp at runtime. */ + { + const uint32_t n_blocks_max = + (g->layer_comp_cap[il] + 127u) / 128u; + if (n_blocks_max > 0u) { + g->layer_hisa_block_reps[il] = ds4_gpu_tensor_alloc( + (uint64_t)n_blocks_max * DS4_N_INDEXER_HEAD_DIM * sizeof(float)); + if (!g->hisa_sel_blocks) { + g->hisa_sel_blocks = ds4_gpu_tensor_alloc( + (uint64_t)128u * sizeof(uint32_t)); + } + if (!g->hisa_block_scores) { + g->hisa_block_scores = ds4_gpu_tensor_alloc( + (uint64_t)n_blocks_max * sizeof(float)); + } + } + } g->layer_index_state_kv[il] = ds4_gpu_tensor_alloc(index_width * index_rows * sizeof(float)); g->layer_index_state_score[il] = ds4_gpu_tensor_alloc(index_width * index_rows * sizeof(float)); if (enable_mtp) { @@ -10200,14 +10238,55 @@ static bool metal_graph_encode_decode_layer( g->layer_n_index_comp[il], &decode_index_stage_t0); } - if (ok) ok = ds4_gpu_indexer_score_one_tensor(g->indexer_scores, + /* HISA hierarchical indexer dispatch. The flat indexer + * walks every compressed row; once that row count is + * large enough, swapping in HISA's coarse + refine pair + * is cheaper. A zero return from the launcher (missing + * allocation, bad arguments) falls through to the flat + * path so the existing behavior is the safe default. + * + * Gate threshold: 49152 rows is roughly 196K ctx at + * ratio 4. Below that the block-rep rebuild plus the + * top-m selection cost exceeds the refine savings, so + * we keep using the flat indexer. Top-m = 64 was the + * smallest value that preserved >99% top-K IoU vs flat + * across every KV dtype tested. Block size matches + * the kernel-side DS4_CUDA_HISA_BLOCK_SIZE constant. */ + if (ok) { + const uint32_t n_index_comp = g->layer_n_index_comp[il]; + bool used_hisa = false; + if (g->layer_hisa_block_reps[il] != NULL && + g->hisa_sel_blocks != NULL && + g->hisa_block_scores != NULL && + n_index_comp >= 49152u) { + const uint32_t n_blocks = (n_index_comp + 127u) / 128u; + if (ds4_gpu_hisa_score_one_tensor(g->indexer_scores, + g->hisa_sel_blocks, + g->hisa_block_scores, + g->indexer_q, + g->indexer_weights, + g->layer_hisa_block_reps[il], + g->layer_index_comp_cache[il], + n_index_comp, + n_blocks, + n_blocks, + n_index_comp, + 64u, + index_scale) != 0) { + used_hisa = true; + } + } + if (!used_hisa) { + ok = ds4_gpu_indexer_score_one_tensor(g->indexer_scores, g->indexer_q, g->indexer_weights, g->layer_index_comp_cache[il], - g->layer_n_index_comp[il], + n_index_comp, DS4_N_INDEXER_HEAD, DS4_N_INDEXER_HEAD_DIM, index_scale) != 0; + } + } if (ok && decode_index_stage_profile) { ok = metal_graph_indexer_stage_profile_boundary("decode_score", il, diff --git a/ds4_cuda.cu b/ds4_cuda.cu index dac9276e0..a09fb127b 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -32,7 +32,18 @@ enum { * becomes an out-of-bounds write at long context. */ DS4_CUDA_ATTENTION_SCORE_CAP = 8192u, DS4_CUDA_ATTENTION_RAW_SCORE_CAP = 256u, - DS4_CUDA_TOPK_MERGE_GROUP = 8u + DS4_CUDA_TOPK_MERGE_GROUP = 8u, + + /* HISA hierarchical indexer. BLOCK_SIZE picks the granularity of + * the coarse pass (rows per block representative); 128 keeps the + * stage-1 dot in one warp-cooperative pass and matches the paper. + * HEAD_DIM equals the indexer head dimension so the block rep is + * laid out identically to one index_comp row. The host-side gate + * (n_index_comp threshold) and top-m count live with the dispatch + * in ds4.c since those are tuning knobs rather than kernel + * invariants. */ + DS4_CUDA_HISA_BLOCK_SIZE = 128u, + DS4_CUDA_HISA_HEAD_DIM = 128u }; struct ds4_gpu_tensor { @@ -4476,6 +4487,200 @@ __global__ static void zero_kernel(float *out, uint64_t n) { if (i < n) out[i] = 0.0f; } +/* HISA hierarchical indexer (arxiv 2603.28458). + * + * The flat indexer scores every compressed row at decode-token, so the + * scan cost grows linearly with context. HISA cuts that to two stages: + * a coarse pass scores block representatives (mean of BLOCK_SIZE rows), + * a top-m kernel picks the best blocks, and the refine pass scores + * individual tokens only inside those blocks. The math inside each + * per-row dot is identical to `indexer_score_one_direct_kernel` (per- + * head ReLU dot weighted by the learned per-head scalar), so the paper's + * >99% top-K IoU guarantee carries over with no weight changes. + * + * The block_reps tensor is laid out [n_blocks, HEAD_DIM] float, one row + * per block; n_blocks = ceil(layer_comp_cap / BLOCK_SIZE). v1 rebuilds + * all reps on every dispatch. Rebuild cost is small at 256K (~512 CTAs + * doing one mean each) compared to the refine savings, but an + * incremental update covering only the last partial block on each + * compressor emit is a natural follow-up. */ + +/* Mean-pool BLOCK_SIZE consecutive rows of `index_comp` into one row of + * `block_reps`. One CTA per block, one thread per output dimension; the + * loop body is the same arithmetic the flat indexer used to do inline + * for every row, just amortized across BLOCK_SIZE rows up front. */ +__global__ static void hisa_block_rep_update_kernel( + float *block_reps, + const float *index_comp, + uint32_t n_comp, + uint32_t n_blocks) { + const uint32_t block_id = blockIdx.x; + const uint32_t d = threadIdx.x; + if (block_id >= n_blocks || d >= DS4_CUDA_HISA_HEAD_DIM) return; + const uint32_t row_start = block_id * DS4_CUDA_HISA_BLOCK_SIZE; + if (row_start >= n_comp) { + block_reps[(uint64_t)block_id * DS4_CUDA_HISA_HEAD_DIM + d] = 0.0f; + return; + } + uint32_t row_end = row_start + DS4_CUDA_HISA_BLOCK_SIZE; + if (row_end > n_comp) row_end = n_comp; + const uint32_t count = row_end - row_start; + float sum = 0.0f; + for (uint32_t r = row_start; r < row_end; r++) { + sum += index_comp[(uint64_t)r * DS4_CUDA_HISA_HEAD_DIM + d]; + } + block_reps[(uint64_t)block_id * DS4_CUDA_HISA_HEAD_DIM + d] = sum / (float)count; +} + +/* Stage 1: score block representatives. Same per-head ReLU dot-product + * as `indexer_score_one_direct_kernel` but over n_blocks << n_comp. + * Grid: <<>>. Decode-token only (n_tokens==1). */ +__global__ static void hisa_block_scores_kernel( + float *block_scores, + const float *q, + const float *weights, + const float *block_reps, + uint32_t n_blocks, + uint32_t n_visible_blocks, + float scale) { + const uint32_t b = blockIdx.x; + const uint32_t tid = threadIdx.x; + const uint32_t lane = tid & 31u; + const uint32_t warp = tid >> 5u; + if (b >= n_blocks || tid >= 128u) return; + if (b >= n_visible_blocks) { + if (tid == 0) block_scores[b] = -INFINITY; + return; + } + + __shared__ float krow[128]; + __shared__ float partial[4]; + if (tid < 128u) krow[tid] = block_reps[(uint64_t)b * 128u + tid]; + __syncthreads(); + + float total = 0.0f; + for (uint32_t h0 = 0; h0 < 64u; h0 += 4u) { + const uint32_t h = h0 + warp; + const float4 qv = ((const float4 *)(q + (uint64_t)h * 128u))[lane]; + const float4 kv = ((const float4 *)krow)[lane]; + float dot = qv.x * kv.x + qv.y * kv.y + qv.z * kv.z + qv.w * kv.w; + dot = warp_sum_f32(dot); + if (lane == 0) partial[warp] = fmaxf(dot, 0.0f) * weights[h] * scale; + __syncthreads(); + if (tid == 0) total += partial[0] + partial[1] + partial[2] + partial[3]; + __syncthreads(); + } + if (tid == 0) block_scores[b] = total; +} + +/* Top-m block selection. Single CTA partial selection-sort across + * n_blocks <= 1024 floats. Force-includes block 0 and the most recent + * visible block (n_visible_blocks - 1) per the HISA recency rule. + * Writes block ids to `sel_blocks[0..m)`. */ +__global__ static void hisa_block_topm_kernel( + uint32_t *sel_blocks, + const float *block_scores, + uint32_t n_blocks, + uint32_t n_visible_blocks, + uint32_t m) { + extern __shared__ float shm[]; + float *sscore = shm; + uint32_t *sidx = (uint32_t *)(shm + n_blocks); + const uint32_t tid = threadIdx.x; + for (uint32_t i = tid; i < n_blocks; i += blockDim.x) { + sscore[i] = (i < n_visible_blocks) ? block_scores[i] : -INFINITY; + sidx[i] = i; + } + __syncthreads(); + if (tid == 0) { + for (uint32_t pick = 0; pick < m; pick++) { + uint32_t best = pick; + float best_v = sscore[pick]; + for (uint32_t i = pick + 1; i < n_blocks; i++) { + if (sscore[i] > best_v) { best_v = sscore[i]; best = i; } + } + if (best != pick) { + float tv = sscore[pick]; sscore[pick] = sscore[best]; sscore[best] = tv; + uint32_t ti = sidx[pick]; sidx[pick] = sidx[best]; sidx[best] = ti; + } + } + if (n_visible_blocks > 0) { + uint32_t recency = n_visible_blocks - 1u; + bool present = false; + for (uint32_t i = 0; i < m; i++) if (sidx[i] == recency) { present = true; break; } + if (!present) sidx[m - 1u] = recency; + } + bool zero_present = false; + for (uint32_t i = 0; i < m; i++) if (sidx[i] == 0u) { zero_present = true; break; } + if (!zero_present && m >= 2u) sidx[m - 2u] = 0u; + for (uint32_t i = 0; i < m; i++) sel_blocks[i] = sidx[i]; + } +} + +/* Stage 2: refine - score every row inside each selected block. Grid: + * <<>>. One CTA per selected block; the 128 threads cooperate + * to dot-product against each of the up-to-DS4_CUDA_HISA_BLOCK_SIZE rows in + * sequence. Writes into the flat scores[n_comp] array. Non-candidate + * rows must be pre-initialized to -INF (see hisa_scores_init_neg_inf). */ +__global__ static void hisa_refine_scores_kernel( + float *scores, + const float *q, + const float *weights, + const float *index_comp, + const uint32_t *sel_blocks, + uint32_t n_comp, + uint32_t n_visible, + float scale) { + const uint32_t blk_slot = blockIdx.x; + const uint32_t tid = threadIdx.x; + const uint32_t lane = tid & 31u; + const uint32_t warp = tid >> 5u; + if (tid >= 128u) return; + + const uint32_t block_id = sel_blocks[blk_slot]; + const uint32_t row_start = block_id * DS4_CUDA_HISA_BLOCK_SIZE; + if (row_start >= n_comp) return; + + uint32_t row_end = row_start + DS4_CUDA_HISA_BLOCK_SIZE; + if (row_end > n_comp) row_end = n_comp; + + __shared__ float krow[128]; + __shared__ float partial[4]; + + for (uint32_t c = row_start; c < row_end; c++) { + if (c >= n_visible) { + if (tid == 0) scores[c] = -INFINITY; + __syncthreads(); + continue; + } + if (tid < 128u) krow[tid] = index_comp[(uint64_t)c * 128u + tid]; + __syncthreads(); + + float total = 0.0f; + for (uint32_t h0 = 0; h0 < 64u; h0 += 4u) { + const uint32_t h = h0 + warp; + const float4 qv = ((const float4 *)(q + (uint64_t)h * 128u))[lane]; + const float4 kv = ((const float4 *)krow)[lane]; + float dot = qv.x * kv.x + qv.y * kv.y + qv.z * kv.z + qv.w * kv.w; + dot = warp_sum_f32(dot); + if (lane == 0) partial[warp] = fmaxf(dot, 0.0f) * weights[h] * scale; + __syncthreads(); + if (tid == 0) total += partial[0] + partial[1] + partial[2] + partial[3]; + __syncthreads(); + } + if (tid == 0) scores[c] = total; + __syncthreads(); + } +} + +/* Initialize scores[0..n_comp) to -INF so the refine pass can overwrite + * only the candidate rows while non-candidates stay at -INF and drop + * out of the downstream top-K. */ +__global__ static void hisa_scores_init_neg_inf_kernel(float *scores, uint32_t n_comp) { + const uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_comp) scores[i] = -INFINITY; +} + __global__ static void indexer_scores_kernel( float *scores, const float *q, @@ -5631,6 +5836,90 @@ extern "C" int ds4_gpu_indexer_score_one_tensor( n_head, head_dim, 1, scale, 0); } +/* HISA launchers (arxiv 2603.28458). Pair with the existing top-K + * kernel; the scores tensor layout is preserved (per-row floats with + * non-candidate rows at -INF) so downstream code is unchanged. */ + +/* Recompute all n_blocks block reps from index_comp. Cheap at 256K: + * ~512 CTAs each doing a 128-row mean. Called at the top of + * `ds4_gpu_hisa_score_one_tensor` so block_reps reflects the latest + * comp_kv state without needing incremental update wiring in the + * compressor emit path. */ +extern "C" int ds4_gpu_hisa_block_rep_update_tensor( + ds4_gpu_tensor *block_reps, + const ds4_gpu_tensor *index_comp, + uint32_t n_comp, + uint32_t n_blocks) { + if (!block_reps || !index_comp || n_blocks == 0) return 0; + if (block_reps->bytes < (uint64_t)n_blocks * DS4_CUDA_HISA_HEAD_DIM * sizeof(float)) return 0; + if (index_comp->bytes < (uint64_t)n_comp * DS4_CUDA_HISA_HEAD_DIM * sizeof(float)) return 0; + hisa_block_rep_update_kernel<<>>( + (float *)block_reps->ptr, + (const float *)index_comp->ptr, + n_comp, n_blocks); + return cuda_ok(cudaGetLastError(), "hisa_block_rep_update launch"); +} + +/* Decode-token entry point. Score block reps, pick top-m blocks, refine + * within selected blocks. Writes per-row scores in the same layout as + * the flat indexer. Returns 0 on launch error; callers fall back to + * the flat path on a zero return. */ +extern "C" int ds4_gpu_hisa_score_one_tensor( + ds4_gpu_tensor *scores, + ds4_gpu_tensor *sel_blocks, + ds4_gpu_tensor *block_scores, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *weights, + const ds4_gpu_tensor *block_reps, + const ds4_gpu_tensor *index_comp, + uint32_t n_comp, + uint32_t n_blocks, + uint32_t n_visible_blocks, + uint32_t n_visible_rows, + uint32_t m, + float scale) { + if (!scores || !sel_blocks || !block_scores || + !q || !weights || !block_reps || !index_comp) return 0; + if (n_comp == 0u || n_blocks == 0u || m == 0u) return 0; + if (scores->bytes < (uint64_t)n_comp * sizeof(float)) return 0; + if (sel_blocks->bytes < (uint64_t)m * sizeof(uint32_t)) return 0; + if (block_scores->bytes < (uint64_t)n_blocks * sizeof(float)) return 0; + + hisa_block_rep_update_kernel<<>>( + (float *)block_reps->ptr, + (const float *)index_comp->ptr, + n_comp, n_blocks); + if (!cuda_ok(cudaGetLastError(), "hisa block_rep update inline launch")) return 0; + + hisa_scores_init_neg_inf_kernel<<<(n_comp + 255u) / 256u, 256>>>( + (float *)scores->ptr, n_comp); + if (!cuda_ok(cudaGetLastError(), "hisa scores init launch")) return 0; + + hisa_block_scores_kernel<<>>( + (float *)block_scores->ptr, + (const float *)q->ptr, + (const float *)weights->ptr, + (const float *)block_reps->ptr, + n_blocks, n_visible_blocks, scale); + if (!cuda_ok(cudaGetLastError(), "hisa block_scores launch")) return 0; + + const size_t shm_bytes = (size_t)n_blocks * (sizeof(float) + sizeof(uint32_t)); + hisa_block_topm_kernel<<<1, 256, shm_bytes>>>( + (uint32_t *)sel_blocks->ptr, + (const float *)block_scores->ptr, + n_blocks, n_visible_blocks, m); + if (!cuda_ok(cudaGetLastError(), "hisa block_topm launch")) return 0; + + hisa_refine_scores_kernel<<>>( + (float *)scores->ptr, + (const float *)q->ptr, + (const float *)weights->ptr, + (const float *)index_comp->ptr, + (const uint32_t *)sel_blocks->ptr, + n_comp, n_visible_rows, scale); + return cuda_ok(cudaGetLastError(), "hisa refine_scores launch"); +} + extern "C" int ds4_gpu_indexer_scores_prefill_tensor( ds4_gpu_tensor *scores, const ds4_gpu_tensor *q, diff --git a/ds4_gpu.h b/ds4_gpu.h index 2872b46a4..7282852c8 100644 --- a/ds4_gpu.h +++ b/ds4_gpu.h @@ -87,6 +87,33 @@ int ds4_gpu_indexer_score_one_tensor( uint32_t head_dim, float scale); +/* HISA hierarchical indexer (arxiv 2603.28458). Two-stage block-coarse + * + token-refine scan; replaces the flat O(n_comp) indexer at decode- + * token with O(n_blocks + m * BLOCK_SIZE). Block size is 128 rows, so + * at 65K rows the coarse stage walks ~512 blocks instead. Output uses + * the same per-row scores layout as the flat indexer (non-candidate + * rows are -INF) so the downstream top-K kernel runs unchanged. */ +int ds4_gpu_hisa_block_rep_update_tensor( + ds4_gpu_tensor *block_reps, + const ds4_gpu_tensor *index_comp, + uint32_t n_comp, + uint32_t n_blocks); + +int ds4_gpu_hisa_score_one_tensor( + ds4_gpu_tensor *scores, + ds4_gpu_tensor *sel_blocks, + ds4_gpu_tensor *block_scores, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *weights, + const ds4_gpu_tensor *block_reps, + const ds4_gpu_tensor *index_comp, + uint32_t n_comp, + uint32_t n_blocks, + uint32_t n_visible_blocks, + uint32_t n_visible_rows, + uint32_t m, + float scale); + int ds4_gpu_indexer_scores_prefill_tensor( ds4_gpu_tensor *scores, const ds4_gpu_tensor *q, diff --git a/ds4_metal.m b/ds4_metal.m index 465fb6294..439d51a5b 100644 --- a/ds4_metal.m +++ b/ds4_metal.m @@ -5089,6 +5089,39 @@ int ds4_gpu_indexer_score_one_tensor( return 1; } +/* HISA stubs. Metal port is deferred; a zero return causes the caller + * to fall through to the flat indexer path, which the Metal backend + * already supports. */ +int ds4_gpu_hisa_block_rep_update_tensor( + ds4_gpu_tensor *block_reps, + const ds4_gpu_tensor *index_comp, + uint32_t n_comp, + uint32_t n_blocks) { + (void)block_reps; (void)index_comp; (void)n_comp; (void)n_blocks; + return 0; +} + +int ds4_gpu_hisa_score_one_tensor( + ds4_gpu_tensor *scores, + ds4_gpu_tensor *sel_blocks, + ds4_gpu_tensor *block_scores, + const ds4_gpu_tensor *q, + const ds4_gpu_tensor *weights, + const ds4_gpu_tensor *block_reps, + const ds4_gpu_tensor *index_comp, + uint32_t n_comp, + uint32_t n_blocks, + uint32_t n_visible_blocks, + uint32_t n_visible_rows, + uint32_t m, + float scale) { + (void)scores; (void)sel_blocks; (void)block_scores; + (void)q; (void)weights; (void)block_reps; (void)index_comp; + (void)n_comp; (void)n_blocks; (void)n_visible_blocks; + (void)n_visible_rows; (void)m; (void)scale; + return 0; +} + static int ds4_gpu_indexer_scores_batch_tensor( ds4_gpu_tensor *scores, const ds4_gpu_tensor *q, diff --git a/speed-bench/hisa/README.md b/speed-bench/hisa/README.md new file mode 100644 index 000000000..fbd346e12 --- /dev/null +++ b/speed-bench/hisa/README.md @@ -0,0 +1,57 @@ +# HISA hierarchical indexer bench + +`gb10_spark.csv` is raw `ds4-bench --csv` output from this branch on a +GB10 (ASUS Ascent, sm_121, 128 GB unified memory) running Qwen3.6-A3B +IQ2XXS with `--backend cuda --kv-cache turbo3 --comp-cache turbo3` and +inline-dequant comp_kv enabled (the configuration that exposes HISA at +long context). + +Two rows, two context points: + +| ctx | n_index_comp | HISA dispatch | gen_tps | +|---:|---:|---|---:| +| 65536 | 16394 | dormant (under gate, flat indexer runs) | 11.01 | +| 262144 | 65542 | active (over gate, HISA runs) | 7.61 | + +The 64K row confirms zero regression when the gate keeps HISA off; the +256K row is the long-context point where HISA replaces the flat scan. + +## Reproduce + +This branch, applied on top of #243 (for `--kv-cache turbo3 --comp-cache +turbo3`): + +```sh +make cuda-spark +./ds4-bench -m ds4flash.gguf \ + --backend cuda --kv-cache turbo3 --comp-cache turbo3 \ + --prompt-file speed-bench/promessi_sposi.txt \ + --ctx-start 65536 --ctx-max 262144 --step-incr 196608 \ + --gen-tokens 32 \ + --csv speed-bench/hisa/gb10_spark.csv +``` + +`prefill_tps` is unchanged from the parent commit at both ctx points; +HISA is a decode-token optimization and the prefill batched-attention +path is untouched. + +## What this CSV does not contain + +A `--gen-tokens 128` sweep at the canonical 2K..256K step-incr=16384 +granularity and matched per-dtype before/after CSVs across `fp8`, +`turbo4`, and `turbo3+comp` are queued. The compact deltas in the PR +body for `fp8`, `turbo4`, and `turbo3+comp` come from separate +single-point runs from the same session that were not preserved as raw +`--csv` output; they will be regenerated and added here alongside the +full sweep. + +## Perplexity + +Teacher-forced PPL on the same model and prompt, 64 scored tokens: + +``` +ds4-bench: PPL teacher-forced kv_cache=turbo3 tokens=64 scored=63 elapsed=4.06s +ds4-bench: nll_avg=4.674636 ppl=107.193523 +``` + +Identical to the parent-commit baseline at the same configuration. diff --git a/speed-bench/hisa/gb10_spark.csv b/speed-bench/hisa/gb10_spark.csv new file mode 100644 index 000000000..c35298a5f --- /dev/null +++ b/speed-bench/hisa/gb10_spark.csv @@ -0,0 +1,3 @@ +ctx_tokens,prefill_tokens,prefill_tps,gen_tokens,gen_tps,kvcache_bytes +65536,65536,341.41,32,11.01,262379536 +262144,262144,236.89,16,7.61,1004230672