Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 22 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ METAL_SRCS := $(wildcard metal/*.metal)

ifeq ($(UNAME_S),Darwin)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
CORE_OBJS = ds4.o ds4_metal.o
CPU_CORE_OBJS = ds4_cpu.o
CORE_OBJS = ds4.o ds4_metal.o ds4_planar_quant.o
CPU_CORE_OBJS = ds4_cpu.o ds4_planar_quant.o
else
CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
CUDA_HOME ?= /usr/local/cuda
Expand All @@ -28,12 +28,12 @@ NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
endif
NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
CORE_OBJS = ds4.o ds4_cuda.o
CPU_CORE_OBJS = ds4_cpu.o
CORE_OBJS = ds4.o ds4_cuda.o ds4_planar_quant.o
CPU_CORE_OBJS = ds4_cpu.o ds4_planar_quant.o
METAL_LDLIBS := $(LDLIBS)
endif

.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
.PHONY: all help clean test planar-quant-test planar-eval cpu cuda cuda-spark cuda-generic cuda-regression

ifeq ($(UNAME_S),Darwin)
all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
Expand Down Expand Up @@ -154,6 +154,21 @@ tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
# Object file for rax.c; rebuilt when rax.c, rax.h, or rax_malloc.h changes.
rax.o: rax.c rax.h rax_malloc.h
$(CC) $(CFLAGS) -c -o $@ rax.c

# Compile the planar quantization module to its own object file; this is the
# ds4_planar_quant.o listed in CORE_OBJS and CPU_CORE_OBJS.
ds4_planar_quant.o: ds4_planar_quant.c ds4_planar_quant.h
$(CC) $(CFLAGS) -c -o $@ ds4_planar_quant.c

# Standalone test binary: compiled and linked in one step directly from the
# test source and ds4_planar_quant.c (it does not reuse ds4_planar_quant.o).
# -I. adds the directory make runs in to the include path, presumably so the
# test under tests/ can include ds4_planar_quant.h; confirm against the test.
tests/planar_quant_test: tests/planar_quant_test.c ds4_planar_quant.c ds4_planar_quant.h
$(CC) $(CFLAGS) -I. -o $@ tests/planar_quant_test.c ds4_planar_quant.c $(LDLIBS)

# Convenience target (declared in .PHONY): builds tests/planar_quant_test if
# needed, then runs it.
planar-quant-test: tests/planar_quant_test
./tests/planar_quant_test

# Standalone evaluation tool: compiled and linked in one step directly from
# tools/planar_eval.c and ds4_planar_quant.c (it does not reuse
# ds4_planar_quant.o). -I. adds the directory make runs in to the include path.
tools/planar_eval: tools/planar_eval.c ds4_planar_quant.c ds4_planar_quant.h
$(CC) $(CFLAGS) -I. -o $@ tools/planar_eval.c ds4_planar_quant.c $(LDLIBS)

# Convenience target (declared in .PHONY): builds tools/planar_eval if needed
# and runs it with a fixed configuration (ds4_like mode, 10000 rows, 8 queries).
planar-eval: tools/planar_eval
./tools/planar_eval --mode ds4_like --rows 10000 --queries 8

# Object file for linenoise.c; rebuilt when linenoise.c or linenoise.h changes.
linenoise.o: linenoise.c linenoise.h
$(CC) $(CFLAGS) -c -o $@ linenoise.c

Expand Down Expand Up @@ -191,9 +206,9 @@ else
$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
endif

# Run the test suite: the extractor self-test of ds4-eval, then ds4_test.
# planar-quant-test is a prerequisite that runs ./tests/planar_quant_test, so
# that test executes before the two commands below.
# NOTE(review): this excerpt carries two prerequisite lines for `test` (the
# pre- and post-change versions of the diff). GNU make merges prerequisites
# from multiple rule lines for one target; confirm only the second line is
# meant to remain in the real Makefile.
test: ds4_test ds4-eval
test: ds4_test ds4-eval planar-quant-test
./ds4-eval --self-test-extractors
./ds4_test

# Remove built executables, all *.o files in the top-level directory, and the
# binaries/objects produced under tests/ and tools/.
# NOTE(review): the two rm lines are the pre- and post-change versions from the
# diff; the second is a superset of the first (adds tests/planar_quant_test and
# tools/planar_eval).
clean:
rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/planar_quant_test tools/planar_eval
359 changes: 292 additions & 67 deletions ds4.c

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions ds4.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ typedef struct {
bool warm_weights;
bool quality;
bool inspect_only;
const char *dump_comp_kv;
bool planar_kv_cache;
bool planar_kv_cache_only;
} ds4_engine_options;

typedef void (*ds4_token_emit_fn)(void *ud, int token);
Expand Down Expand Up @@ -201,6 +204,7 @@ void ds4_session_invalidate(ds4_session *s);
void ds4_session_rewind(ds4_session *s, int pos);
int ds4_session_pos(ds4_session *s);
int ds4_session_ctx(ds4_session *s);
/* Dump the session's compressed KV rows to the file at `path` in binary form
 * (the CLI exposes this as --dump-comp-kv, invoked after prefill, for offline
 * Planar3 evaluation). The CLI caller treats any non-zero return as failure
 * and then prints `err` as a string, so on failure `err` is expected to hold
 * a message. NOTE(review): presumably at most `errlen` bytes are written and
 * the message is NUL-terminated; confirm in the implementation. */
int ds4_session_dump_comp_kv(ds4_session *s, const char *path, char *err, size_t errlen);
int ds4_engine_routed_quant_bits(ds4_engine *e);
bool ds4_engine_has_mtp(ds4_engine *e);
int ds4_engine_mtp_draft_tokens(ds4_engine *e);
Expand Down
23 changes: 23 additions & 0 deletions ds4_cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,13 @@ static void usage(FILE *fp) {
"Diagnostics:\n"
" --inspect\n"
" Load the model and print a summary only.\n"
" --dump-comp-kv FILE\n"
" Dump compressed KV rows (binary) after prefill for offline Planar3 eval.\n"
" --planar-kv-cache\n"
" Enable Planar3 quantization for compressed attention KV cache.\n"
" --planar-kv-cache-only\n"
" Only keep Planar3 compressed KV cache, skip FP16/FP32 allocation.\n"
" Implies --planar-kv-cache. Saves memory at the cost of dequant overhead.\n"
" --dump-tokens\n"
" Tokenize -p/--prompt-file exactly as written, then exit without inference.\n"
" --dump-logits FILE\n"
Expand Down Expand Up @@ -509,6 +516,15 @@ static int run_sampled_generation(ds4_engine *engine, const cli_config *cfg, con
}
ds4_session_set_progress(session, NULL, NULL);
ds4_session_set_display_progress(session, NULL, NULL);

if (cfg->engine.dump_comp_kv) {
if (ds4_session_dump_comp_kv(session, cfg->engine.dump_comp_kv, err, sizeof(err)) != 0) {
fprintf(stderr, "ds4: compressed KV dump failed: %s\n", err);
ds4_session_free(session);
return 1;
}
}

const double t_prefill1 = cli_now_sec();

int max_tokens = cfg->gen.n_predict;
Expand Down Expand Up @@ -1537,6 +1553,13 @@ static cli_config parse_options(int argc, char **argv) {
exit(2);
} else if (!strcmp(arg, "--inspect")) {
c.inspect = true;
} else if (!strcmp(arg, "--dump-comp-kv")) {
c.engine.dump_comp_kv = need_arg(&i, argc, argv, arg);
} else if (!strcmp(arg, "--planar-kv-cache")) {
c.engine.planar_kv_cache = true;
} else if (!strcmp(arg, "--planar-kv-cache-only")) {
c.engine.planar_kv_cache = true;
c.engine.planar_kv_cache_only = true;
} else if (!strcmp(arg, "--warm-weights")) {
c.engine.warm_weights = true;
} else if (!strcmp(arg, "--server")) {
Expand Down
14 changes: 12 additions & 2 deletions ds4_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,8 @@ int ds4_gpu_attention_decode_heads_tensor(
const ds4_gpu_tensor *comp_mask,
uint32_t use_mask,
uint32_t n_head,
uint32_t head_dim);
uint32_t head_dim,
uint32_t comp_kv_planar);

int ds4_gpu_attention_prefill_raw_heads_tensor(
ds4_gpu_tensor *heads,
Expand Down Expand Up @@ -484,7 +485,8 @@ int ds4_gpu_attention_decode_mixed_batch_heads_tensor(
uint32_t window,
uint32_t ratio,
uint32_t n_head,
uint32_t head_dim);
uint32_t head_dim,
uint32_t comp_kv_planar);

int ds4_gpu_attention_indexed_mixed_batch_heads_tensor(
ds4_gpu_tensor *heads,
Expand All @@ -495,6 +497,7 @@ int ds4_gpu_attention_indexed_mixed_batch_heads_tensor(
const ds4_gpu_tensor *raw_kv,
const ds4_gpu_tensor *comp_kv,
uint32_t comp_kv_f16,
uint32_t comp_kv_planar,
const ds4_gpu_tensor *topk,
uint32_t n_tokens,
uint32_t pos0,
Expand All @@ -517,6 +520,7 @@ int ds4_gpu_attention_prefill_static_mixed_heads_tensor(
const ds4_gpu_tensor *raw_kv,
const ds4_gpu_tensor *comp_kv,
uint32_t comp_kv_f16,
uint32_t comp_kv_planar,
uint32_t n_tokens,
uint32_t n_comp,
uint32_t window,
Expand Down Expand Up @@ -575,6 +579,12 @@ int ds4_gpu_attention_output_low_q8_tensor(
* routing, shared SwiGLU, and the IQ2_XXS/Q2_K/Q4_K routed experts.
*/

/*
 * Planar3 quantization entry point of the GPU backend.
 *
 * NOTE(review): the implementation is not visible in this excerpt. Judging by
 * the names only, `src` is the unquantized input, `dst` receives the Planar3
 * encoding, and `n_rows` / `head_dim` give the number of rows and the width of
 * each row. Confirm these, the required tensor layouts/sizes, and the meaning
 * of the int return value against the backend implementation.
 */
int ds4_gpu_planar3_quantize_tensor(
const ds4_gpu_tensor *src,
ds4_gpu_tensor *dst,
uint32_t n_rows,
uint32_t head_dim);

int ds4_gpu_swiglu_tensor(
ds4_gpu_tensor *out,
const ds4_gpu_tensor *gate,
Expand Down
Loading