antirez · hexxyan · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/Makefile b/Makefile
@@ -16,8 +16,8 @@ METAL_SRCS := $(wildcard metal/*.metal)
 
 ifeq ($(UNAME_S),Darwin)
 METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
-CORE_OBJS = ds4.o ds4_metal.o
-CPU_CORE_OBJS = ds4_cpu.o
+CORE_OBJS = ds4.o ds4_metal.o ds4_planar_quant.o
+CPU_CORE_OBJS = ds4_cpu.o ds4_planar_quant.o
 else
 CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
 CUDA_HOME ?= /usr/local/cuda
@@ -28,12 +28,12 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
 endif
 NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
-CORE_OBJS = ds4.o ds4_cuda.o
-CPU_CORE_OBJS = ds4_cpu.o
+CORE_OBJS = ds4.o ds4_cuda.o ds4_planar_quant.o
+CPU_CORE_OBJS = ds4_cpu.o ds4_planar_quant.o
 METAL_LDLIBS := $(LDLIBS)
 endif
 
-.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
+.PHONY: all help clean test planar-quant-test planar-eval cpu cuda cuda-spark cuda-generic cuda-regression
 
 ifeq ($(UNAME_S),Darwin)
 all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
@@ -154,6 +154,21 @@ tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
 rax.o: rax.c rax.h rax_malloc.h
 	$(CC) $(CFLAGS) -c -o $@ rax.c
 
+ds4_planar_quant.o: ds4_planar_quant.c ds4_planar_quant.h
+	$(CC) $(CFLAGS) -c -o $@ ds4_planar_quant.c
+# The standalone binaries below link ds4_planar_quant.o instead of recompiling its source.
+tests/planar_quant_test: tests/planar_quant_test.c ds4_planar_quant.o ds4_planar_quant.h
+	$(CC) $(CFLAGS) -I. -o $@ tests/planar_quant_test.c ds4_planar_quant.o $(LDLIBS)
+
+planar-quant-test: tests/planar_quant_test
+	./tests/planar_quant_test
+# tools/planar_eval links the same ds4_planar_quant.o as the test binary.
+tools/planar_eval: tools/planar_eval.c ds4_planar_quant.o ds4_planar_quant.h
+	$(CC) $(CFLAGS) -I. -o $@ tools/planar_eval.c ds4_planar_quant.o $(LDLIBS)
+
+planar-eval: tools/planar_eval
+	./tools/planar_eval --mode ds4_like --rows 10000 --queries 8
+
 linenoise.o: linenoise.c linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ linenoise.c
 
@@ -191,9 +206,9 @@ else
 	$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
 endif
 
-test: ds4_test ds4-eval
+test: ds4_test ds4-eval planar-quant-test
 	./ds4-eval --self-test-extractors
 	./ds4_test
 
 clean:
-	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/planar_quant_test tools/planar_eval
diff --git a/ds4.c b/ds4.c
diff --git a/ds4.h b/ds4.h
@@ -73,6 +73,9 @@ typedef struct {
     bool warm_weights;
     bool quality;
     bool inspect_only;
+    const char *dump_comp_kv;
+    bool planar_kv_cache;
+    bool planar_kv_cache_only;
 } ds4_engine_options;
 
 typedef void (*ds4_token_emit_fn)(void *ud, int token);
@@ -201,6 +204,7 @@ void ds4_session_invalidate(ds4_session *s);
 void ds4_session_rewind(ds4_session *s, int pos);
 int ds4_session_pos(ds4_session *s);
 int ds4_session_ctx(ds4_session *s);
+int ds4_session_dump_comp_kv(ds4_session *s, const char *path, char *err, size_t errlen);
 int ds4_engine_routed_quant_bits(ds4_engine *e);
 bool ds4_engine_has_mtp(ds4_engine *e);
 int ds4_engine_mtp_draft_tokens(ds4_engine *e);

diff --git a/ds4_cli.c b/ds4_cli.c
@@ -160,6 +160,13 @@ static void usage(FILE *fp) {
         "Diagnostics:\n"
         "  --inspect\n"
         "      Load the model and print a summary only.\n"
+        "  --dump-comp-kv FILE\n"
+        "      Dump compressed KV rows (binary) after prefill for offline Planar3 eval.\n"
+        "  --planar-kv-cache\n"
+        "      Enable Planar3 quantization for compressed attention KV cache.\n"
+        "  --planar-kv-cache-only\n"
+        "      Only keep Planar3 compressed KV cache, skip FP16/FP32 allocation.\n"
+        "      Implies --planar-kv-cache. Saves memory at the cost of dequant overhead.\n"
         "  --dump-tokens\n"
         "      Tokenize -p/--prompt-file exactly as written, then exit without inference.\n"
         "  --dump-logits FILE\n"
@@ -509,6 +516,15 @@ static int run_sampled_generation(ds4_engine *engine, const cli_config *cfg, con
     }
     ds4_session_set_progress(session, NULL, NULL);
     ds4_session_set_display_progress(session, NULL, NULL);
     const double t_prefill1 = cli_now_sec();
+
+    /* Dump only after t_prefill1 is taken so the dump's file I/O is not part of that timestamp. */
+    if (cfg->engine.dump_comp_kv) {
+        if (ds4_session_dump_comp_kv(session, cfg->engine.dump_comp_kv, err, sizeof(err)) != 0) {
+            fprintf(stderr, "ds4: compressed KV dump failed: %s\n", err);
+            ds4_session_free(session);
+            return 1;
+        }
+    }
 
     int max_tokens = cfg->gen.n_predict;
@@ -1537,6 +1553,13 @@ static cli_config parse_options(int argc, char **argv) {
             exit(2);
         } else if (!strcmp(arg, "--inspect")) {
             c.inspect = true;
+        } else if (!strcmp(arg, "--dump-comp-kv")) {
+            c.engine.dump_comp_kv = need_arg(&i, argc, argv, arg);
+        } else if (!strcmp(arg, "--planar-kv-cache")) {
+            c.engine.planar_kv_cache = true;
+        } else if (!strcmp(arg, "--planar-kv-cache-only")) {
+            c.engine.planar_kv_cache = true;
+            c.engine.planar_kv_cache_only = true;
         } else if (!strcmp(arg, "--warm-weights")) {
             c.engine.warm_weights = true;
         } else if (!strcmp(arg, "--server")) {

diff --git a/ds4_gpu.h b/ds4_gpu.h
@@ -434,7 +434,8 @@ int ds4_gpu_attention_decode_heads_tensor(
         const ds4_gpu_tensor *comp_mask,
         uint32_t                use_mask,
         uint32_t                n_head,
-        uint32_t                head_dim);
+        uint32_t                head_dim,
+        uint32_t                comp_kv_planar);
 
 int ds4_gpu_attention_prefill_raw_heads_tensor(
         ds4_gpu_tensor       *heads,
@@ -484,7 +485,8 @@ int ds4_gpu_attention_decode_mixed_batch_heads_tensor(
         uint32_t                window,
         uint32_t                ratio,
         uint32_t                n_head,
-        uint32_t                head_dim);
+        uint32_t                head_dim,
+        uint32_t                comp_kv_planar);
 
 int ds4_gpu_attention_indexed_mixed_batch_heads_tensor(
         ds4_gpu_tensor       *heads,
@@ -495,6 +497,7 @@ int ds4_gpu_attention_indexed_mixed_batch_heads_tensor(
         const ds4_gpu_tensor *raw_kv,
         const ds4_gpu_tensor *comp_kv,
         uint32_t                comp_kv_f16,
+        uint32_t                comp_kv_planar,
         const ds4_gpu_tensor *topk,
         uint32_t                n_tokens,
         uint32_t                pos0,
@@ -517,6 +520,7 @@ int ds4_gpu_attention_prefill_static_mixed_heads_tensor(
         const ds4_gpu_tensor *raw_kv,
         const ds4_gpu_tensor *comp_kv,
         uint32_t                comp_kv_f16,
+        uint32_t                comp_kv_planar,
         uint32_t                n_tokens,
         uint32_t                n_comp,
         uint32_t                window,
@@ -575,6 +579,19 @@ int ds4_gpu_attention_output_low_q8_tensor(
  * routing, shared SwiGLU, and the IQ2_XXS/Q2_K/Q4_K routed experts.
  */
 
+/*
+ * Planar3 quantization of a tensor (the format selected by --planar-kv-cache).
+ * NOTE(review): this declaration sits under the routed-expert section comment
+ * but is not described by it. Presumably n_rows rows of head_dim elements are
+ * read from src and written quantized to dst; the dst layout and the return
+ * convention are not visible here -- confirm against the backend code.
+ */
+int ds4_gpu_planar3_quantize_tensor(
+        const ds4_gpu_tensor *src,
+        ds4_gpu_tensor       *dst,
+        uint32_t              n_rows,
+        uint32_t              head_dim);
+
 int ds4_gpu_swiglu_tensor(
         ds4_gpu_tensor       *out,
         const ds4_gpu_tensor *gate,