diff --git a/ds4.c b/ds4.c
index 21573fb81..4c68ad210 100644
--- a/ds4.c
+++ b/ds4.c
@@ -8545,6 +8545,17 @@ typedef struct {
     ds4_gpu_tensor *layer_attn_state_kv[DS4_MAX_LAYER];
     ds4_gpu_tensor *layer_attn_state_score[DS4_MAX_LAYER];
     ds4_gpu_tensor *layer_index_comp_cache[DS4_MAX_LAYER];
+    /* HISA: per-layer block representatives (mean-pool of HISA_BLOCK_SIZE
+     * consecutive index_comp rows).  Sized n_blocks_max x 128 floats per
+     * layer, where n_blocks_max = ceil(layer_comp_cap / 128).  Roughly
+     * 256 KB per layer at 256K ctx cap, allocated alongside the comp
+     * cache so the decode dispatch can switch into HISA whenever
+     * n_index_comp crosses the gate. */
+    ds4_gpu_tensor *layer_hisa_block_reps[DS4_MAX_LAYER];
+    /* HISA scratch: top-m block indices and block-scores, shared across
+     * layers (sized once for the largest layer's block count). */
+    ds4_gpu_tensor *hisa_sel_blocks;
+    ds4_gpu_tensor *hisa_block_scores;
     ds4_gpu_tensor *layer_index_state_kv[DS4_MAX_LAYER];
     ds4_gpu_tensor *layer_index_state_score[DS4_MAX_LAYER];
 
@@ -8825,6 +8836,11 @@ static void metal_graph_free(ds4_gpu_graph *g) {
     for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
         ds4_gpu_tensor_free(g->layer_index_comp_cache[il]);
     }
+    for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
+        ds4_gpu_tensor_free(g->layer_hisa_block_reps[il]);
+    }
+    ds4_gpu_tensor_free(g->hisa_sel_blocks);
+    ds4_gpu_tensor_free(g->hisa_block_scores);
     for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
         ds4_gpu_tensor_free(g->layer_index_state_kv[il]);
     }
@@ -9279,6 +9295,28 @@ static bool metal_graph_alloc_raw_cap(
                 g->layer_index_comp_cache[il] = metal_graph_alloc_kv_cache_tensor(
                         managed_kv_cache,
                         (uint64_t)g->layer_comp_cap[il] * DS4_N_INDEXER_HEAD_DIM * sizeof(float));
+                /* HISA scratch: per-layer block reps plus the shared
+                 * selection and score scratches.  n_blocks_max =
+                 * ceil(layer_comp_cap / 128).  Small enough at any
+                 * sensible ctx (~256 KB per layer at 256K cap) that we
+                 * always allocate; the decode dispatch decides whether
+                 * to actually use HISA based on n_index_comp at runtime. */
+                {
+                    const uint32_t n_blocks_max =
+                            (g->layer_comp_cap[il] + 127u) / 128u;
+                    if (n_blocks_max > 0u) {
+                        g->layer_hisa_block_reps[il] = ds4_gpu_tensor_alloc(
+                                (uint64_t)n_blocks_max * DS4_N_INDEXER_HEAD_DIM * sizeof(float));
+                        if (!g->hisa_sel_blocks) {
+                            g->hisa_sel_blocks = ds4_gpu_tensor_alloc(
+                                    (uint64_t)128u * sizeof(uint32_t));
+                        }
+                        if (!g->hisa_block_scores) {
+                            g->hisa_block_scores = ds4_gpu_tensor_alloc(
+                                    (uint64_t)n_blocks_max * sizeof(float));
+                        }
+                    }
+                }
                 g->layer_index_state_kv[il] = ds4_gpu_tensor_alloc(index_width * index_rows * sizeof(float));
                 g->layer_index_state_score[il] = ds4_gpu_tensor_alloc(index_width * index_rows * sizeof(float));
                 if (enable_mtp) {
@@ -10200,14 +10238,55 @@ static bool metal_graph_encode_decode_layer(
                                                                     g->layer_n_index_comp[il],
                                                                     &decode_index_stage_t0);
                 }
-                if (ok) ok = ds4_gpu_indexer_score_one_tensor(g->indexer_scores,
+                /* HISA hierarchical indexer dispatch.  The flat indexer
+                 * walks every compressed row; once that row count is
+                 * large enough, swapping in HISA's coarse + refine pair
+                 * is cheaper.  A zero return from the launcher (missing
+                 * allocation, bad arguments) falls through to the flat
+                 * path so the existing behavior is the safe default.
+                 *
+                 * Gate threshold: 49152 rows is roughly 196K ctx at
+                 * ratio 4.  Below that the block-rep rebuild plus the
+                 * top-m selection cost exceeds the refine savings, so
+                 * we keep using the flat indexer.  Top-m = 64 was the
+                 * smallest value that preserved >99% top-K IoU vs flat
+                 * across every KV dtype tested.  Block size matches
+                 * the kernel-side DS4_CUDA_HISA_BLOCK_SIZE constant. */
+                if (ok) {
+                    const uint32_t n_index_comp = g->layer_n_index_comp[il];
+                    bool used_hisa = false;
+                    if (g->layer_hisa_block_reps[il] != NULL &&
+                        g->hisa_sel_blocks != NULL &&
+                        g->hisa_block_scores != NULL &&
+                        n_index_comp >= 49152u) {
+                        const uint32_t n_blocks = (n_index_comp + 127u) / 128u;
+                        if (ds4_gpu_hisa_score_one_tensor(g->indexer_scores,
+                                                          g->hisa_sel_blocks,
+                                                          g->hisa_block_scores,
+                                                          g->indexer_q,
+                                                          g->indexer_weights,
+                                                          g->layer_hisa_block_reps[il],
+                                                          g->layer_index_comp_cache[il],
+                                                          n_index_comp,
+                                                          n_blocks,
+                                                          n_blocks,
+                                                          n_index_comp,
+                                                          64u,
+                                                          index_scale) != 0) {
+                            used_hisa = true;
+                        }
+                    }
+                    if (!used_hisa) {
+                        ok = ds4_gpu_indexer_score_one_tensor(g->indexer_scores,
                                                                 g->indexer_q,
                                                                 g->indexer_weights,
                                                                 g->layer_index_comp_cache[il],
-                                                                g->layer_n_index_comp[il],
+                                                                n_index_comp,
                                                                 DS4_N_INDEXER_HEAD,
                                                                 DS4_N_INDEXER_HEAD_DIM,
                                                                 index_scale) != 0;
+                    }
+                }
                 if (ok && decode_index_stage_profile) {
                     ok = metal_graph_indexer_stage_profile_boundary("decode_score",
                                                                     il,
diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index dac9276e0..a09fb127b 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -32,7 +32,18 @@ enum {
      * becomes an out-of-bounds write at long context. */
     DS4_CUDA_ATTENTION_SCORE_CAP = 8192u,
     DS4_CUDA_ATTENTION_RAW_SCORE_CAP = 256u,
-    DS4_CUDA_TOPK_MERGE_GROUP = 8u
+    DS4_CUDA_TOPK_MERGE_GROUP = 8u,
+
+    /* HISA hierarchical indexer.  BLOCK_SIZE picks the granularity of
+     * the coarse pass (rows per block representative); 128 keeps the
+     * stage-1 dot in one warp-cooperative pass and matches the paper.
+     * HEAD_DIM equals the indexer head dimension so the block rep is
+     * laid out identically to one index_comp row.  The host-side gate
+     * (n_index_comp threshold) and top-m count live with the dispatch
+     * in ds4.c since those are tuning knobs rather than kernel
+     * invariants. */
+    DS4_CUDA_HISA_BLOCK_SIZE = 128u,
+    DS4_CUDA_HISA_HEAD_DIM = 128u
 };
 
 struct ds4_gpu_tensor {
@@ -4476,6 +4487,200 @@ __global__ static void zero_kernel(float *out, uint64_t n) {
     if (i < n) out[i] = 0.0f;
 }
 
+/* HISA hierarchical indexer (arxiv 2603.28458).
+ *
+ * The flat indexer scores every compressed row at decode-token, so the
+ * scan cost grows linearly with context.  HISA cuts that to two stages:
+ * a coarse pass scores block representatives (mean of BLOCK_SIZE rows),
+ * a top-m kernel picks the best blocks, and the refine pass scores
+ * individual tokens only inside those blocks.  The math inside each
+ * per-row dot is identical to `indexer_score_one_direct_kernel` (per-
+ * head ReLU dot weighted by the learned per-head scalar), so the paper's
+ * >99% top-K IoU guarantee carries over with no weight changes.
+ *
+ * The block_reps tensor is laid out [n_blocks, HEAD_DIM] float, one row
+ * per block; n_blocks = ceil(layer_comp_cap / BLOCK_SIZE).  v1 rebuilds
+ * all reps on every dispatch.  Rebuild cost is small at 256K (~512 CTAs
+ * doing one mean each) compared to the refine savings, but an
+ * incremental update covering only the last partial block on each
+ * compressor emit is a natural follow-up. */
+
+/* Mean-pool BLOCK_SIZE consecutive rows of `index_comp` into one row of
+ * `block_reps`.  One CTA per block, one thread per output dimension; the
+ * loop body is the same arithmetic the flat indexer used to do inline
+ * for every row, just amortized across BLOCK_SIZE rows up front. */
+__global__ static void hisa_block_rep_update_kernel(
+        float       *block_reps,
+        const float *index_comp,
+        uint32_t     n_comp,
+        uint32_t     n_blocks) {
+    const uint32_t block_id = blockIdx.x;
+    const uint32_t d = threadIdx.x;
+    if (block_id >= n_blocks || d >= DS4_CUDA_HISA_HEAD_DIM) return;
+    const uint32_t row_start = block_id * DS4_CUDA_HISA_BLOCK_SIZE;
+    if (row_start >= n_comp) {
+        block_reps[(uint64_t)block_id * DS4_CUDA_HISA_HEAD_DIM + d] = 0.0f;
+        return;
+    }
+    uint32_t row_end = row_start + DS4_CUDA_HISA_BLOCK_SIZE;
+    if (row_end > n_comp) row_end = n_comp;
+    const uint32_t count = row_end - row_start;
+    float sum = 0.0f;
+    for (uint32_t r = row_start; r < row_end; r++) {
+        sum += index_comp[(uint64_t)r * DS4_CUDA_HISA_HEAD_DIM + d];
+    }
+    block_reps[(uint64_t)block_id * DS4_CUDA_HISA_HEAD_DIM + d] = sum / (float)count;
+}
+
+/* Stage 1: score block representatives.  Same per-head ReLU dot-product
+ * as `indexer_score_one_direct_kernel` but over n_blocks << n_comp.
+ * Grid: <<<n_blocks, 128>>>.  Decode-token only (n_tokens==1). */
+__global__ static void hisa_block_scores_kernel(
+        float       *block_scores,
+        const float *q,
+        const float *weights,
+        const float *block_reps,
+        uint32_t     n_blocks,
+        uint32_t     n_visible_blocks,
+        float        scale) {
+    const uint32_t b = blockIdx.x;
+    const uint32_t tid = threadIdx.x;
+    const uint32_t lane = tid & 31u;
+    const uint32_t warp = tid >> 5u;
+    if (b >= n_blocks || tid >= 128u) return;
+    if (b >= n_visible_blocks) {
+        if (tid == 0) block_scores[b] = -INFINITY;
+        return;
+    }
+
+    __shared__ float krow[128];
+    __shared__ float partial[4];
+    if (tid < 128u) krow[tid] = block_reps[(uint64_t)b * 128u + tid];
+    __syncthreads();
+
+    float total = 0.0f;
+    for (uint32_t h0 = 0; h0 < 64u; h0 += 4u) {
+        const uint32_t h = h0 + warp;
+        const float4 qv = ((const float4 *)(q + (uint64_t)h * 128u))[lane];
+        const float4 kv = ((const float4 *)krow)[lane];
+        float dot = qv.x * kv.x + qv.y * kv.y + qv.z * kv.z + qv.w * kv.w;
+        dot = warp_sum_f32(dot);
+        if (lane == 0) partial[warp] = fmaxf(dot, 0.0f) * weights[h] * scale;
+        __syncthreads();
+        if (tid == 0) total += partial[0] + partial[1] + partial[2] + partial[3];
+        __syncthreads();
+    }
+    if (tid == 0) block_scores[b] = total;
+}
+
+/* Top-m block selection.  Single CTA partial selection-sort across
+ * n_blocks <= 1024 floats.  Force-includes block 0 and the most recent
+ * visible block (n_visible_blocks - 1) per the HISA recency rule.
+ * Writes block ids to `sel_blocks[0..m)`. */
+__global__ static void hisa_block_topm_kernel(
+        uint32_t    *sel_blocks,
+        const float *block_scores,
+        uint32_t     n_blocks,
+        uint32_t     n_visible_blocks,
+        uint32_t     m) {
+    extern __shared__ float shm[];
+    float *sscore = shm;
+    uint32_t *sidx = (uint32_t *)(shm + n_blocks);
+    const uint32_t tid = threadIdx.x;
+    for (uint32_t i = tid; i < n_blocks; i += blockDim.x) {
+        sscore[i] = (i < n_visible_blocks) ? block_scores[i] : -INFINITY;
+        sidx[i]   = i;
+    }
+    __syncthreads();
+    if (tid == 0) {
+        for (uint32_t pick = 0; pick < m; pick++) {
+            uint32_t best = pick;
+            float best_v = sscore[pick];
+            for (uint32_t i = pick + 1; i < n_blocks; i++) {
+                if (sscore[i] > best_v) { best_v = sscore[i]; best = i; }
+            }
+            if (best != pick) {
+                float tv = sscore[pick];   sscore[pick] = sscore[best]; sscore[best] = tv;
+                uint32_t ti = sidx[pick];  sidx[pick]   = sidx[best];   sidx[best]   = ti;
+            }
+        }
+        if (n_visible_blocks > 0) {
+            uint32_t recency = n_visible_blocks - 1u;
+            bool present = false;
+            for (uint32_t i = 0; i < m; i++) if (sidx[i] == recency) { present = true; break; }
+            if (!present) sidx[m - 1u] = recency;
+        }
+        bool zero_present = false;
+        for (uint32_t i = 0; i < m; i++) if (sidx[i] == 0u) { zero_present = true; break; }
+        if (!zero_present && m >= 2u) sidx[m - 2u] = 0u;
+        for (uint32_t i = 0; i < m; i++) sel_blocks[i] = sidx[i];
+    }
+}
+
+/* Stage 2: refine - score every row inside each selected block.  Grid:
+ * <<<m, 128>>>.  One CTA per selected block; the 128 threads cooperate
+ * to dot-product against each of the up-to-DS4_CUDA_HISA_BLOCK_SIZE rows in
+ * sequence.  Writes into the flat scores[n_comp] array.  Non-candidate
+ * rows must be pre-initialized to -INF (see hisa_scores_init_neg_inf). */
+__global__ static void hisa_refine_scores_kernel(
+        float          *scores,
+        const float    *q,
+        const float    *weights,
+        const float    *index_comp,
+        const uint32_t *sel_blocks,
+        uint32_t        n_comp,
+        uint32_t        n_visible,
+        float           scale) {
+    const uint32_t blk_slot = blockIdx.x;
+    const uint32_t tid = threadIdx.x;
+    const uint32_t lane = tid & 31u;
+    const uint32_t warp = tid >> 5u;
+    if (tid >= 128u) return;
+
+    const uint32_t block_id = sel_blocks[blk_slot];
+    const uint32_t row_start = block_id * DS4_CUDA_HISA_BLOCK_SIZE;
+    if (row_start >= n_comp) return;
+
+    uint32_t row_end = row_start + DS4_CUDA_HISA_BLOCK_SIZE;
+    if (row_end > n_comp) row_end = n_comp;
+
+    __shared__ float krow[128];
+    __shared__ float partial[4];
+
+    for (uint32_t c = row_start; c < row_end; c++) {
+        if (c >= n_visible) {
+            if (tid == 0) scores[c] = -INFINITY;
+            __syncthreads();
+            continue;
+        }
+        if (tid < 128u) krow[tid] = index_comp[(uint64_t)c * 128u + tid];
+        __syncthreads();
+
+        float total = 0.0f;
+        for (uint32_t h0 = 0; h0 < 64u; h0 += 4u) {
+            const uint32_t h = h0 + warp;
+            const float4 qv = ((const float4 *)(q + (uint64_t)h * 128u))[lane];
+            const float4 kv = ((const float4 *)krow)[lane];
+            float dot = qv.x * kv.x + qv.y * kv.y + qv.z * kv.z + qv.w * kv.w;
+            dot = warp_sum_f32(dot);
+            if (lane == 0) partial[warp] = fmaxf(dot, 0.0f) * weights[h] * scale;
+            __syncthreads();
+            if (tid == 0) total += partial[0] + partial[1] + partial[2] + partial[3];
+            __syncthreads();
+        }
+        if (tid == 0) scores[c] = total;
+        __syncthreads();
+    }
+}
+
+/* Initialize scores[0..n_comp) to -INF so the refine pass can overwrite
+ * only the candidate rows while non-candidates stay at -INF and drop
+ * out of the downstream top-K. */
+__global__ static void hisa_scores_init_neg_inf_kernel(float *scores, uint32_t n_comp) {
+    const uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n_comp) scores[i] = -INFINITY;
+}
+
 __global__ static void indexer_scores_kernel(
         float *scores,
         const float *q,
@@ -5631,6 +5836,90 @@ extern "C" int ds4_gpu_indexer_score_one_tensor(
                                  n_head, head_dim, 1, scale, 0);
 }
 
+/* HISA launchers (arxiv 2603.28458).  Pair with the existing top-K
+ * kernel; the scores tensor layout is preserved (per-row floats with
+ * non-candidate rows at -INF) so downstream code is unchanged. */
+
+/* Recompute all n_blocks block reps from index_comp.  Cheap at 256K:
+ * ~512 CTAs each doing a 128-row mean.  Called at the top of
+ * `ds4_gpu_hisa_score_one_tensor` so block_reps reflects the latest
+ * comp_kv state without needing incremental update wiring in the
+ * compressor emit path. */
+extern "C" int ds4_gpu_hisa_block_rep_update_tensor(
+        ds4_gpu_tensor       *block_reps,
+        const ds4_gpu_tensor *index_comp,
+        uint32_t              n_comp,
+        uint32_t              n_blocks) {
+    if (!block_reps || !index_comp || n_blocks == 0) return 0;
+    if (block_reps->bytes < (uint64_t)n_blocks * DS4_CUDA_HISA_HEAD_DIM * sizeof(float)) return 0;
+    if (index_comp->bytes < (uint64_t)n_comp * DS4_CUDA_HISA_HEAD_DIM * sizeof(float)) return 0;
+    hisa_block_rep_update_kernel<<<n_blocks, DS4_CUDA_HISA_HEAD_DIM>>>(
+            (float *)block_reps->ptr,
+            (const float *)index_comp->ptr,
+            n_comp, n_blocks);
+    return cuda_ok(cudaGetLastError(), "hisa_block_rep_update launch");
+}
+
+/* Decode-token entry point.  Score block reps, pick top-m blocks, refine
+ * within selected blocks.  Writes per-row scores in the same layout as
+ * the flat indexer.  Returns 0 on launch error; callers fall back to
+ * the flat path on a zero return. */
+extern "C" int ds4_gpu_hisa_score_one_tensor(
+        ds4_gpu_tensor       *scores,
+        ds4_gpu_tensor       *sel_blocks,
+        ds4_gpu_tensor       *block_scores,
+        const ds4_gpu_tensor *q,
+        const ds4_gpu_tensor *weights,
+        const ds4_gpu_tensor *block_reps,
+        const ds4_gpu_tensor *index_comp,
+        uint32_t              n_comp,
+        uint32_t              n_blocks,
+        uint32_t              n_visible_blocks,
+        uint32_t              n_visible_rows,
+        uint32_t              m,
+        float                 scale) {
+    if (!scores || !sel_blocks || !block_scores ||
+        !q || !weights || !block_reps || !index_comp) return 0;
+    if (n_comp == 0u || n_blocks == 0u || m == 0u) return 0;
+    if (scores->bytes < (uint64_t)n_comp * sizeof(float)) return 0;
+    if (sel_blocks->bytes < (uint64_t)m * sizeof(uint32_t)) return 0;
+    if (block_scores->bytes < (uint64_t)n_blocks * sizeof(float)) return 0;
+
+    hisa_block_rep_update_kernel<<<n_blocks, DS4_CUDA_HISA_HEAD_DIM>>>(
+            (float *)block_reps->ptr,
+            (const float *)index_comp->ptr,
+            n_comp, n_blocks);
+    if (!cuda_ok(cudaGetLastError(), "hisa block_rep update inline launch")) return 0;
+
+    hisa_scores_init_neg_inf_kernel<<<(n_comp + 255u) / 256u, 256>>>(
+            (float *)scores->ptr, n_comp);
+    if (!cuda_ok(cudaGetLastError(), "hisa scores init launch")) return 0;
+
+    hisa_block_scores_kernel<<<n_blocks, 128>>>(
+            (float *)block_scores->ptr,
+            (const float *)q->ptr,
+            (const float *)weights->ptr,
+            (const float *)block_reps->ptr,
+            n_blocks, n_visible_blocks, scale);
+    if (!cuda_ok(cudaGetLastError(), "hisa block_scores launch")) return 0;
+
+    const size_t shm_bytes = (size_t)n_blocks * (sizeof(float) + sizeof(uint32_t));
+    hisa_block_topm_kernel<<<1, 256, shm_bytes>>>(
+            (uint32_t *)sel_blocks->ptr,
+            (const float *)block_scores->ptr,
+            n_blocks, n_visible_blocks, m);
+    if (!cuda_ok(cudaGetLastError(), "hisa block_topm launch")) return 0;
+
+    hisa_refine_scores_kernel<<<m, 128>>>(
+            (float *)scores->ptr,
+            (const float *)q->ptr,
+            (const float *)weights->ptr,
+            (const float *)index_comp->ptr,
+            (const uint32_t *)sel_blocks->ptr,
+            n_comp, n_visible_rows, scale);
+    return cuda_ok(cudaGetLastError(), "hisa refine_scores launch");
+}
+
 extern "C" int ds4_gpu_indexer_scores_prefill_tensor(
         ds4_gpu_tensor       *scores,
         const ds4_gpu_tensor *q,
diff --git a/ds4_gpu.h b/ds4_gpu.h
index 2872b46a4..7282852c8 100644
--- a/ds4_gpu.h
+++ b/ds4_gpu.h
@@ -87,6 +87,33 @@ int ds4_gpu_indexer_score_one_tensor(
         uint32_t                head_dim,
         float                   scale);
 
+/* HISA hierarchical indexer (arxiv 2603.28458).  Two-stage block-coarse
+ * + token-refine scan; replaces the flat O(n_comp) indexer at decode-
+ * token with O(n_blocks + m * BLOCK_SIZE).  Block size is 128 rows, so
+ * at 65K rows the coarse stage walks ~512 blocks instead.  Output uses
+ * the same per-row scores layout as the flat indexer (non-candidate
+ * rows are -INF) so the downstream top-K kernel runs unchanged. */
+int ds4_gpu_hisa_block_rep_update_tensor(
+        ds4_gpu_tensor       *block_reps,
+        const ds4_gpu_tensor *index_comp,
+        uint32_t              n_comp,
+        uint32_t              n_blocks);
+
+int ds4_gpu_hisa_score_one_tensor(
+        ds4_gpu_tensor       *scores,
+        ds4_gpu_tensor       *sel_blocks,
+        ds4_gpu_tensor       *block_scores,
+        const ds4_gpu_tensor *q,
+        const ds4_gpu_tensor *weights,
+        const ds4_gpu_tensor *block_reps,
+        const ds4_gpu_tensor *index_comp,
+        uint32_t              n_comp,
+        uint32_t              n_blocks,
+        uint32_t              n_visible_blocks,
+        uint32_t              n_visible_rows,
+        uint32_t              m,
+        float                 scale);
+
 int ds4_gpu_indexer_scores_prefill_tensor(
         ds4_gpu_tensor       *scores,
         const ds4_gpu_tensor *q,
diff --git a/ds4_metal.m b/ds4_metal.m
index 465fb6294..439d51a5b 100644
--- a/ds4_metal.m
+++ b/ds4_metal.m
@@ -5089,6 +5089,39 @@ int ds4_gpu_indexer_score_one_tensor(
     return 1;
 }
 
+/* HISA stubs.  Metal port is deferred; a zero return causes the caller
+ * to fall through to the flat indexer path, which the Metal backend
+ * already supports. */
+int ds4_gpu_hisa_block_rep_update_tensor(
+        ds4_gpu_tensor       *block_reps,
+        const ds4_gpu_tensor *index_comp,
+        uint32_t              n_comp,
+        uint32_t              n_blocks) {
+    (void)block_reps; (void)index_comp; (void)n_comp; (void)n_blocks;
+    return 0;
+}
+
+int ds4_gpu_hisa_score_one_tensor(
+        ds4_gpu_tensor       *scores,
+        ds4_gpu_tensor       *sel_blocks,
+        ds4_gpu_tensor       *block_scores,
+        const ds4_gpu_tensor *q,
+        const ds4_gpu_tensor *weights,
+        const ds4_gpu_tensor *block_reps,
+        const ds4_gpu_tensor *index_comp,
+        uint32_t              n_comp,
+        uint32_t              n_blocks,
+        uint32_t              n_visible_blocks,
+        uint32_t              n_visible_rows,
+        uint32_t              m,
+        float                 scale) {
+    (void)scores; (void)sel_blocks; (void)block_scores;
+    (void)q; (void)weights; (void)block_reps; (void)index_comp;
+    (void)n_comp; (void)n_blocks; (void)n_visible_blocks;
+    (void)n_visible_rows; (void)m; (void)scale;
+    return 0;
+}
+
 static int ds4_gpu_indexer_scores_batch_tensor(
         ds4_gpu_tensor       *scores,
         const ds4_gpu_tensor *q,
diff --git a/speed-bench/hisa/README.md b/speed-bench/hisa/README.md
new file mode 100644
index 000000000..fbd346e12
--- /dev/null
+++ b/speed-bench/hisa/README.md
@@ -0,0 +1,57 @@
+# HISA hierarchical indexer bench
+
+`gb10_spark.csv` is raw `ds4-bench --csv` output from this branch on a
+GB10 (ASUS Ascent, sm_121, 128 GB unified memory) running Qwen3.6-A3B
+IQ2XXS with `--backend cuda --kv-cache turbo3 --comp-cache turbo3` and
+inline-dequant comp_kv enabled (the configuration that exposes HISA at
+long context).
+
+Two rows, two context points:
+
+| ctx | n_index_comp | HISA dispatch | gen_tps |
+|---:|---:|---|---:|
+| 65536 | 16394 | dormant (under gate, flat indexer runs) | 11.01 |
+| 262144 | 65542 | active (over gate, HISA runs) | 7.61 |
+
+The 64K row confirms zero regression when the gate keeps HISA off; the
+256K row is the long-context point where HISA replaces the flat scan.
+
+## Reproduce
+
+This branch, applied on top of #243 (for `--kv-cache turbo3 --comp-cache
+turbo3`):
+
+```sh
+make cuda-spark
+./ds4-bench -m ds4flash.gguf \
+  --backend cuda --kv-cache turbo3 --comp-cache turbo3 \
+  --prompt-file speed-bench/promessi_sposi.txt \
+  --ctx-start 65536 --ctx-max 262144 --step-incr 196608 \
+  --gen-tokens 32 \
+  --csv speed-bench/hisa/gb10_spark.csv
+```
+
+`prefill_tps` is unchanged from the parent commit at both ctx points;
+HISA is a decode-token optimization and the prefill batched-attention
+path is untouched.
+
+## What this CSV does not contain
+
+A `--gen-tokens 128` sweep at the canonical 2K..256K step-incr=16384
+granularity and matched per-dtype before/after CSVs across `fp8`,
+`turbo4`, and `turbo3+comp` are queued.  The compact deltas in the PR
+body for `fp8`, `turbo4`, and `turbo3+comp` come from separate
+single-point runs from the same session that were not preserved as raw
+`--csv` output; they will be regenerated and added here alongside the
+full sweep.
+
+## Perplexity
+
+Teacher-forced PPL on the same model and prompt, 64 scored tokens:
+
+```
+ds4-bench: PPL teacher-forced  kv_cache=turbo3  tokens=64  scored=63  elapsed=4.06s
+ds4-bench:   nll_avg=4.674636  ppl=107.193523
+```
+
+Identical to the parent-commit baseline at the same configuration.
diff --git a/speed-bench/hisa/gb10_spark.csv b/speed-bench/hisa/gb10_spark.csv
new file mode 100644
index 000000000..c35298a5f
--- /dev/null
+++ b/speed-bench/hisa/gb10_spark.csv
@@ -0,0 +1,3 @@
+ctx_tokens,prefill_tokens,prefill_tps,gen_tokens,gen_tps,kvcache_bytes
+65536,65536,341.41,32,11.01,262379536
+262144,262144,236.89,16,7.61,1004230672