Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 61 additions & 22 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,25 @@ OBJCFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -fo

LDLIBS ?= -lm -pthread
METAL_SRCS := $(wildcard metal/*.metal)
# Optional llguidance integration (structured-output constrained decoding).
# These knobs may be overridden on the command line, e.g. `make LLGUIDANCE=1`.
LLGUIDANCE_REPO ?= https://github.com/guidance-ai/llguidance
LLGUIDANCE_TAG ?= v1.7.5
LLGUIDANCE_DIR ?= .deps/llguidance
LLGUIDANCE ?= 0

# Listed in the server and test link lines whether or not llguidance is enabled.
SERVER_EXTRA_OBJS := ds4_llguidance.o

ifeq ($(LLGUIDANCE),1)
# Static library produced inside the llguidance checkout; targets that list
# DS4_LLGUIDANCE_DEPS as a prerequisite wait for it to be built.
LLGUIDANCE_LIB := $(LLGUIDANCE_DIR)/target/release/libllguidance.a
DS4_LLGUIDANCE_DEPS := $(LLGUIDANCE_LIB)

# Only the default vendored location is cloned automatically; a custom
# LLGUIDANCE_DIR is expected to already contain a checkout.
ifeq ($(LLGUIDANCE_DIR),.deps/llguidance)
LLGUIDANCE_NEEDS_CLONE := 1
endif

# Link the archive itself; every platform except Darwin also gets -ldl.
ifeq ($(UNAME_S),Darwin)
LLGUIDANCE_LDLIBS := $(LLGUIDANCE_LIB)
else
LLGUIDANCE_LDLIBS := $(LLGUIDANCE_LIB) -ldl
endif

CFLAGS += -DDS4_USE_LLGUIDANCE -I$(LLGUIDANCE_DIR)/parser
LDLIBS += $(LLGUIDANCE_LDLIBS)
endif

ifeq ($(UNAME_S),Darwin)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
Expand All @@ -30,39 +49,41 @@ NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NA
CORE_OBJS = ds4.o ds4_distributed.o ds4_cuda.o
CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
CUDA_LDLIBS += $(LLGUIDANCE_LDLIBS)
METAL_LDLIBS := $(LDLIBS)
endif

.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
.PHONY: all help clean distclean test cpu cuda cuda-spark cuda-generic cuda-regression

ifeq ($(UNAME_S),Darwin)
all: ds4 ds4-server ds4-bench ds4-eval ds4-agent

help:
@echo "DS4 build targets:"
@echo " make Build Metal ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
@echo " make LLGUIDANCE=1 Build with structured-output constrained decoding"
@echo " make cpu Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
@echo " make test Build and run tests"
@echo " make clean Remove build outputs"

ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS)
ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(CC) $(CFLAGS) -o $@ ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)

ds4-server: ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
ds4-server: ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(CC) $(CFLAGS) -o $@ ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(METAL_LDLIBS)

ds4-bench: ds4_bench.o ds4_help.o $(CORE_OBJS)
ds4-bench: ds4_bench.o ds4_help.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(CC) $(CFLAGS) -o $@ ds4_bench.o ds4_help.o $(CORE_OBJS) $(METAL_LDLIBS)

ds4-eval: ds4_eval.o ds4_help.o $(CORE_OBJS)
ds4-eval: ds4_eval.o ds4_help.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(CC) $(CFLAGS) -o $@ ds4_eval.o ds4_help.o $(CORE_OBJS) $(METAL_LDLIBS)

ds4-agent: ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
ds4-agent: ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(CC) $(CFLAGS) -o $@ ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)

cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS)
cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o ds4_help.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_help.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o ds4_help.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o ds4_help.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
Expand All @@ -74,6 +95,7 @@ all: help

help:
@echo "DS4 build targets:"
@echo " make LLGUIDANCE=1 ... Build with structured-output constrained decoding"
@echo " make cuda-spark Build CUDA for DGX Spark / GB10"
@echo " make cuda-generic Build CUDA for a generic local CUDA GPU"
@echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
Expand All @@ -95,24 +117,24 @@ cuda:
fi
$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH="$(CUDA_ARCH)"

ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS)
ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

ds4-server: ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS)
ds4-server: ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

ds4-bench: ds4_bench.o ds4_help.o $(CORE_OBJS)
ds4-bench: ds4_bench.o ds4_help.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

ds4-eval: ds4_eval.o ds4_help.o $(CORE_OBJS)
ds4-eval: ds4_eval.o ds4_help.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

ds4-agent: ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
ds4-agent: ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS)
cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o ds4_help.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_help.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o ds4_help.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o ds4_help.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
Expand All @@ -133,9 +155,12 @@ ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4.h
ds4_help.o: ds4_help.c ds4_help.h
$(CC) $(CFLAGS) -c -o $@ ds4_help.c

ds4_server.o: ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
ds4_server.o: ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h ds4_llguidance.h rax.h
$(CC) $(CFLAGS) -c -o $@ ds4_server.c

# Structured-output glue object; it reaches the server and test link lines
# through SERVER_EXTRA_OBJS. When LLGUIDANCE=1, DS4_LLGUIDANCE_DEPS names the
# llguidance static library, so the checkout and cargo build run before this
# file is compiled (presumably so the headers on the
# -I$(LLGUIDANCE_DIR)/parser include path exist -- TODO confirm).
ds4_llguidance.o: ds4_llguidance.c ds4_llguidance.h ds4.h $(DS4_LLGUIDANCE_DEPS)
$(CC) $(CFLAGS) -c -o $@ ds4_llguidance.c

ds4_bench.o: ds4_bench.c ds4.h ds4_distributed.h ds4_help.h
$(CC) $(CFLAGS) -c -o $@ ds4_bench.c

Expand All @@ -151,7 +176,7 @@ ds4_web.o: ds4_web.c ds4_web.h
ds4_kvstore.o: ds4_kvstore.c ds4_kvstore.h ds4.h
$(CC) $(CFLAGS) -c -o $@ ds4_kvstore.c

ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h ds4_llguidance.h rax.h
$(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c

tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
Expand All @@ -169,7 +194,7 @@ ds4_cpu.o: ds4.c ds4.h ds4_distributed.h ds4_gpu.h
ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_distributed.h ds4_help.h linenoise.h
$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_cli.c

ds4_server_cpu.o: ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h
ds4_server_cpu.o: ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h ds4_llguidance.h rax.h
$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_server.c

ds4_bench_cpu.o: ds4_bench.c ds4.h ds4_distributed.h ds4_help.h
Expand All @@ -190,11 +215,22 @@ ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

ds4_test: ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS)
ifeq ($(LLGUIDANCE),1)
ifeq ($(LLGUIDANCE_NEEDS_CLONE),1)
# Fetch the pinned llguidance tag into the default vendored directory.
# The clone is guarded so this rule is safe to re-run: the cuda target
# re-invokes make with -B, which remakes this target even when the checkout
# already exists, and git refuses to clone into a non-empty directory.
# A partial checkout (no Cargo.toml) still reaches git clone and fails loudly.
$(LLGUIDANCE_DIR):
	mkdir -p .deps
	test -f $(LLGUIDANCE_DIR)/Cargo.toml || git clone --depth 1 --branch $(LLGUIDANCE_TAG) $(LLGUIDANCE_REPO) $(LLGUIDANCE_DIR)
endif

# Build the static library with cargo. The checkout is an order-only
# prerequisite: it must exist first, but changes to the directory's mtime
# must not by themselves trigger a rebuild of the library.
$(LLGUIDANCE_LIB): | $(LLGUIDANCE_DIR)
	cargo build --release --package llguidance --manifest-path $(LLGUIDANCE_DIR)/Cargo.toml
endif

ds4_test: ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
ifeq ($(UNAME_S),Darwin)
$(CC) $(CFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
$(CC) $(CFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(METAL_LDLIBS)
else
$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(CUDA_LDLIBS)
endif

test: ds4_test ds4-eval q4k-dot-test
Expand All @@ -207,3 +243,6 @@ q4k-dot-test: tests/test_q4k_dot.c

# Remove the built binaries, test executables, and object files from the
# source tree. The .deps directory is not touched by this target.
clean:
rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o

# Everything `clean` removes, plus the .deps directory that holds the default
# llguidance checkout. A checkout supplied through a custom LLGUIDANCE_DIR
# outside .deps is left alone.
distclean: clean
rm -rf .deps
24 changes: 21 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -635,9 +635,27 @@ tool calls are mapped back to OpenAI tool calls.

`/v1/responses` accepts OpenAI Responses-style `input`, `instructions`,
`tools`, `tool_choice`, `max_output_tokens`, `temperature`, `top_p`, `stream`,
and `reasoning`. It is the preferred endpoint for Codex CLI. The server keeps
Responses continuations bound to live state when possible, and can fall back to
the same DSML rendering and KV prefix reuse used by chat completions.
`text.format`, and `reasoning`. It is the preferred endpoint for Codex CLI.
The server keeps Responses continuations bound to live state when possible, and
can fall back to the same DSML rendering and KV prefix reuse used by chat
completions.

Structured outputs are available when the server is built with llguidance:

```sh
make LLGUIDANCE=1
```

By default, this clones llguidance into `.deps/llguidance` (requires `git`) and
builds the static library there with `cargo`. To use an existing checkout
instead, pass `LLGUIDANCE_DIR=/path/to/llguidance`.

With that build, `/v1/chat/completions` supports
`response_format.type=json_schema`, `json_object`, `regex`, `lark`, and
`llguidance`; `/v1/responses` supports the same modes through `text.format`.
Structured outputs use constrained decoding. If thinking is enabled, the
constraint applies after `</think>` so the final assistant content is structured.
Structured outputs currently cannot be combined with tools.

`/v1/messages` is the Anthropic-compatible endpoint used by Claude Code style
clients. It accepts `system`, `messages`, `tools`, `tool_choice`, `max_tokens`,
Expand Down
Loading