diff --git a/Makefile b/Makefile index 795f8cbe1..320e15659 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,25 @@ OBJCFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -fo LDLIBS ?= -lm -pthread METAL_SRCS := $(wildcard metal/*.metal) +LLGUIDANCE ?= 0 +LLGUIDANCE_DIR ?= .deps/llguidance +LLGUIDANCE_REPO ?= https://github.com/guidance-ai/llguidance +LLGUIDANCE_TAG ?= v1.7.5 +SERVER_EXTRA_OBJS := ds4_llguidance.o + +ifeq ($(LLGUIDANCE),1) +ifeq ($(LLGUIDANCE_DIR),.deps/llguidance) +LLGUIDANCE_NEEDS_CLONE := 1 +endif +LLGUIDANCE_LIB := $(LLGUIDANCE_DIR)/target/release/libllguidance.a +LLGUIDANCE_LDLIBS := $(LLGUIDANCE_LIB) +ifneq ($(UNAME_S),Darwin) +LLGUIDANCE_LDLIBS += -ldl +endif +CFLAGS += -DDS4_USE_LLGUIDANCE -I$(LLGUIDANCE_DIR)/parser +LDLIBS += $(LLGUIDANCE_LDLIBS) +DS4_LLGUIDANCE_DEPS := $(LLGUIDANCE_LIB) +endif ifeq ($(UNAME_S),Darwin) METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal @@ -30,10 +49,11 @@ NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NA CORE_OBJS = ds4.o ds4_distributed.o ds4_cuda.o CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas +CUDA_LDLIBS += $(LLGUIDANCE_LDLIBS) METAL_LDLIBS := $(LDLIBS) endif -.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression +.PHONY: all help clean distclean test cpu cuda cuda-spark cuda-generic cuda-regression ifeq ($(UNAME_S),Darwin) all: ds4 ds4-server ds4-bench ds4-eval ds4-agent @@ -41,28 +61,29 @@ all: ds4 ds4-server ds4-bench ds4-eval ds4-agent help: @echo "DS4 build targets:" @echo " make Build Metal ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent" + @echo " make LLGUIDANCE=1 Build with structured-output constrained decoding" @echo " make cpu Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent" @echo " make test Build and run tests" @echo " make clean Remove build 
outputs" -ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS) +ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(CC) $(CFLAGS) -o $@ ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS) -ds4-server: ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) - $(CC) $(CFLAGS) -o $@ ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS) +ds4-server: ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) + $(CC) $(CFLAGS) -o $@ ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(METAL_LDLIBS) -ds4-bench: ds4_bench.o ds4_help.o $(CORE_OBJS) +ds4-bench: ds4_bench.o ds4_help.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(CC) $(CFLAGS) -o $@ ds4_bench.o ds4_help.o $(CORE_OBJS) $(METAL_LDLIBS) -ds4-eval: ds4_eval.o ds4_help.o $(CORE_OBJS) +ds4-eval: ds4_eval.o ds4_help.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(CC) $(CFLAGS) -o $@ ds4_eval.o ds4_help.o $(CORE_OBJS) $(METAL_LDLIBS) -ds4-agent: ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) +ds4-agent: ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(CC) $(CFLAGS) -o $@ ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS) -cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS) +cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o ds4_help.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) - $(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_help.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS) + $(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o 
ds4-bench ds4_bench_cpu.o ds4_help.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o ds4_help.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) @@ -74,6 +95,7 @@ all: help help: @echo "DS4 build targets:" + @echo " make LLGUIDANCE=1 ... Build with structured-output constrained decoding" @echo " make cuda-spark Build CUDA for DGX Spark / GB10" @echo " make cuda-generic Build CUDA for a generic local CUDA GPU" @echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value" @@ -95,24 +117,24 @@ cuda: fi $(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH="$(CUDA_ARCH)" -ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS) +ds4: ds4_cli.o ds4_help.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) -ds4-server: ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) +ds4-server: ds4_server.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) -ds4-bench: ds4_bench.o ds4_help.o $(CORE_OBJS) +ds4-bench: ds4_bench.o ds4_help.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) -ds4-eval: ds4_eval.o ds4_help.o $(CORE_OBJS) +ds4-eval: ds4_eval.o ds4_help.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) -ds4-agent: ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) +ds4-agent: ds4_agent.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) -cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS) +cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) 
$(CPU_CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) $(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o ds4_help.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) - $(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_help.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS) + $(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o ds4_help.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o ds4_help.o $(CPU_CORE_OBJS) $(LDLIBS) $(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_help.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS) @@ -133,9 +155,12 @@ ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4.h ds4_help.o: ds4_help.c ds4_help.h $(CC) $(CFLAGS) -c -o $@ ds4_help.c -ds4_server.o: ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h +ds4_server.o: ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h ds4_llguidance.h rax.h $(CC) $(CFLAGS) -c -o $@ ds4_server.c +ds4_llguidance.o: ds4_llguidance.c ds4_llguidance.h ds4.h $(DS4_LLGUIDANCE_DEPS) + $(CC) $(CFLAGS) -c -o $@ ds4_llguidance.c + ds4_bench.o: ds4_bench.c ds4.h ds4_distributed.h ds4_help.h $(CC) $(CFLAGS) -c -o $@ ds4_bench.c @@ -151,7 +176,7 @@ ds4_web.o: ds4_web.c ds4_web.h ds4_kvstore.o: ds4_kvstore.c ds4_kvstore.h ds4.h $(CC) $(CFLAGS) -c -o $@ ds4_kvstore.c -ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h rax.h +ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h ds4_llguidance.h rax.h $(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h @@ -169,7 +194,7 @@ ds4_cpu.o: ds4.c ds4.h ds4_distributed.h ds4_gpu.h ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_distributed.h ds4_help.h linenoise.h $(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_cli.c -ds4_server_cpu.o: ds4_server.c ds4.h ds4_distributed.h ds4_help.h 
ds4_kvstore.h rax.h +ds4_server_cpu.o: ds4_server.c ds4.h ds4_distributed.h ds4_help.h ds4_kvstore.h ds4_llguidance.h rax.h $(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_server.c ds4_bench_cpu.o: ds4_bench.c ds4.h ds4_distributed.h ds4_help.h @@ -190,11 +215,22 @@ ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o $(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS) -ds4_test: ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) +ifeq ($(LLGUIDANCE),1) +ifeq ($(LLGUIDANCE_NEEDS_CLONE),1) +$(LLGUIDANCE_DIR): + mkdir -p .deps + git clone --depth 1 --branch $(LLGUIDANCE_TAG) $(LLGUIDANCE_REPO) $(LLGUIDANCE_DIR) +endif + +$(LLGUIDANCE_LIB): | $(LLGUIDANCE_DIR) + cargo build --release --package llguidance --manifest-path $(LLGUIDANCE_DIR)/Cargo.toml +endif + +ds4_test: ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS) ifeq ($(UNAME_S),Darwin) - $(CC) $(CFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS) + $(CC) $(CFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(METAL_LDLIBS) else - $(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS) + $(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(CUDA_LDLIBS) endif test: ds4_test ds4-eval q4k-dot-test @@ -207,3 +243,6 @@ q4k-dot-test: tests/test_q4k_dot.c clean: rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test tests/test_q4k_dot *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o + +distclean: clean + rm -rf .deps diff --git a/README.md b/README.md index bbc0e76da..a478ef6a4 100644 --- a/README.md +++ b/README.md @@ -635,9 +635,27 @@ tool calls are mapped back to OpenAI tool calls. 
`/v1/responses` accepts OpenAI Responses-style `input`, `instructions`, `tools`, `tool_choice`, `max_output_tokens`, `temperature`, `top_p`, `stream`, -and `reasoning`. It is the preferred endpoint for Codex CLI. The server keeps -Responses continuations bound to live state when possible, and can fall back to -the same DSML rendering and KV prefix reuse used by chat completions. +`text.format`, and `reasoning`. It is the preferred endpoint for Codex CLI. +The server keeps Responses continuations bound to live state when possible, and +can fall back to the same DSML rendering and KV prefix reuse used by chat +completions. + +Structured outputs are available when the server is built with llguidance: + +```sh +make LLGUIDANCE=1 +``` + +By default, this clones llguidance into `.deps/llguidance` and builds the +static library there. To use an existing checkout instead, pass +`LLGUIDANCE_DIR=/path/to/llguidance`. + +With that build, `/v1/chat/completions` supports +`response_format.type=json_schema`, `json_object`, `regex`, `lark`, and +`llguidance`; `/v1/responses` supports the same modes through `text.format`. +Structured outputs use constrained decoding. If thinking is enabled, the +constraint applies after `</think>` so the final assistant content is structured. +They currently cannot be combined with tools. `/v1/messages` is the Anthropic-compatible endpoint used by Claude Code style clients. 
It accepts `system`, `messages`, `tools`, `tool_choice`, `max_tokens`, diff --git a/ds4.c b/ds4.c index 39694470d..67aefa528 100644 --- a/ds4.c +++ b/ds4.c @@ -16753,6 +16753,231 @@ static int sample_top_p_min_p( return ids[filtered - 1]; } +static bool sample_mask_allows(const uint32_t *mask, size_t words, uint32_t id) { + if (!mask) return true; + const size_t word = id / 32u; + if (word >= words) return false; + return (mask[word] & (UINT32_C(1) << (id & 31u))) != 0; +} + +static bool sample_filtered_allows( + const uint32_t *allow_mask, + size_t allow_words, + const uint32_t *deny_mask, + size_t deny_words, + uint32_t id) { + return sample_mask_allows(allow_mask, allow_words, id) && + !(deny_mask && sample_mask_allows(deny_mask, deny_words, id)); +} + +static int sample_argmax_filtered( + const float *logits, + uint32_t n_vocab, + const uint32_t *allow_mask, + size_t allow_words, + const uint32_t *deny_mask, + size_t deny_words) { + int best = -1; + float best_v = DS4_NEG_INF; + for (uint32_t i = 0; i < n_vocab; i++) { + if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) { + continue; + } + const float v = logits[i]; + if (best < 0 || v > best_v) { + best_v = v; + best = (int)i; + } + } + return best; +} + +static int sample_full_vocab_filtered( + const float *logits, + uint32_t n_vocab, + float temperature, + float top_p, + float min_p, + const uint32_t *allow_mask, + size_t allow_words, + const uint32_t *deny_mask, + size_t deny_words, + uint64_t *rng) { + float max_logit = DS4_NEG_INF; + int best = -1; + uint32_t finite = 0; + for (uint32_t i = 0; i < n_vocab; i++) { + if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) { + continue; + } + const float v = logits[i]; + if (!isfinite(v)) continue; + finite++; + if (best < 0 || v > max_logit) { + max_logit = v; + best = (int)i; + } + } + if (finite == 0) return sample_argmax_filtered(logits, n_vocab, allow_mask, + allow_words, deny_mask, + deny_words); + 
+ if (top_p >= 1.0f) { + float sum = 0.0f; + const float min_rel = min_p > 0.0f ? min_p : 0.0f; + for (uint32_t i = 0; i < n_vocab; i++) { + if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) { + continue; + } + const float v = logits[i]; + if (!isfinite(v)) continue; + const float p = expf((v - max_logit) / temperature); + if (p < min_rel) continue; + sum += p; + } + if (sum <= 0.0f || !isfinite(sum)) return best; + float r = sample_rng_f32(rng) * sum; + for (uint32_t i = 0; i < n_vocab; i++) { + if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) { + continue; + } + const float v = logits[i]; + if (!isfinite(v)) continue; + const float p = expf((v - max_logit) / temperature); + if (p < min_rel) continue; + r -= p; + if (r <= 0.0f) return (int)i; + } + return best; + } + + sample_candidate *cand = xmalloc((size_t)finite * sizeof(cand[0])); + uint32_t n = 0; + float sum = 0.0f; + for (uint32_t i = 0; i < n_vocab; i++) { + if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) { + continue; + } + const float v = logits[i]; + if (!isfinite(v)) continue; + const float p = expf((v - max_logit) / temperature); + cand[n++] = (sample_candidate){.id = (int)i, .logit = v, .prob = p}; + sum += p; + } + if (sum <= 0.0f || !isfinite(sum)) { + free(cand); + return best; + } + + qsort(cand, n, sizeof(cand[0]), sample_candidate_cmp_desc); + const float min_prob = (cand[0].prob / sum) * (min_p > 0.0f ? 
min_p : 0.0f); + float filtered_sum = 0.0f; + uint32_t filtered = 0; + for (uint32_t i = 0; i < n; i++) { + const float p = cand[i].prob / sum; + if (i > 0 && p < min_prob) break; + filtered_sum += cand[i].prob; + filtered++; + if (filtered_sum / sum >= top_p) break; + } + if (filtered == 0) { + free(cand); + return best; + } + + float r = sample_rng_f32(rng) * filtered_sum; + for (uint32_t i = 0; i < filtered; i++) { + r -= cand[i].prob; + if (r <= 0.0f) { + const int id = cand[i].id; + free(cand); + return id; + } + } + const int id = cand[filtered - 1].id; + free(cand); + return id; +} + +static int sample_top_p_min_p_filtered( + const float *logits, + uint32_t n_vocab, + float temperature, + int top_k, + float top_p, + float min_p, + const uint32_t *allow_mask, + size_t allow_words, + const uint32_t *deny_mask, + size_t deny_words, + uint64_t *rng) { + if (temperature <= 0.0f) { + return sample_argmax_filtered(logits, n_vocab, allow_mask, allow_words, + deny_mask, deny_words); + } + if (top_p <= 0.0f || top_p > 1.0f) top_p = 1.0f; + if (min_p < 0.0f) min_p = 0.0f; + if (top_k <= 0) { + return sample_full_vocab_filtered(logits, n_vocab, temperature, top_p, + min_p, allow_mask, allow_words, + deny_mask, deny_words, rng); + } + if (top_k > 1024) top_k = 1024; + if ((uint32_t)top_k > n_vocab) top_k = (int)n_vocab; + + int ids[1024]; + float vals[1024]; + int n = 0; + for (uint32_t i = 0; i < n_vocab; i++) { + if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) { + continue; + } + float v = logits[i]; + if (!isfinite(v)) continue; + if (n == top_k && v <= vals[n - 1]) continue; + int j = n < top_k ? 
n++ : n - 1; + while (j > 0 && vals[j - 1] < v) { + vals[j] = vals[j - 1]; + ids[j] = ids[j - 1]; + j--; + } + vals[j] = v; + ids[j] = (int)i; + } + if (n == 0) { + return sample_argmax_filtered(logits, n_vocab, allow_mask, allow_words, + deny_mask, deny_words); + } + + float probs[1024]; + const float max_logit = vals[0]; + float sum = 0.0f; + for (int i = 0; i < n; i++) { + probs[i] = expf((vals[i] - max_logit) / temperature); + sum += probs[i]; + } + if (sum <= 0.0f || !isfinite(sum)) return ids[0]; + + const float min_prob = (probs[0] / sum) * min_p; + float filtered_sum = 0.0f; + int filtered = 0; + for (int i = 0; i < n; i++) { + float p = probs[i] / sum; + if (i > 0 && p < min_prob) break; + filtered_sum += probs[i]; + filtered++; + if (filtered_sum / sum >= top_p) break; + } + if (filtered <= 0) return ids[0]; + + float r = sample_rng_f32(rng) * filtered_sum; + for (int i = 0; i < filtered; i++) { + r -= probs[i]; + if (r <= 0.0f) return ids[i]; + } + return ids[filtered - 1]; +} + static void print_top_logits( FILE * fp, const char * label, @@ -20361,6 +20586,20 @@ int ds4_session_sample(ds4_session *s, float temperature, int top_k, float top_p return sample_top_p_min_p(s->logits, DS4_N_VOCAB, temperature, top_k, top_p, min_p, rng); } +int ds4_session_sample_masked(ds4_session *s, float temperature, int top_k, + float top_p, float min_p, + const uint32_t *allow_mask, + size_t allow_mask_words, + const uint32_t *deny_mask, + size_t deny_mask_words, + uint64_t *rng) { + if (!s || !s->logits || !allow_mask) return -1; + return sample_top_p_min_p_filtered(s->logits, DS4_N_VOCAB, temperature, + top_k, top_p, min_p, allow_mask, + allow_mask_words, deny_mask, + deny_mask_words, rng); +} + int ds4_session_top_logprobs(ds4_session *s, ds4_token_score *out, int k) { if (!s || !out || k <= 0) return 0; if (k > (int)DS4_N_VOCAB) k = (int)DS4_N_VOCAB; diff --git a/ds4.h b/ds4.h index 7b7233c36..4fed595ae 100644 --- a/ds4.h +++ b/ds4.h @@ -236,6 +236,13 @@ int 
ds4_session_argmax_excluding(ds4_session *s, int excluded_id); int ds4_sample_logits(const float *logits, int n_vocab, float temperature, int top_k, float top_p, float min_p, uint64_t *rng); int ds4_session_sample(ds4_session *s, float temperature, int top_k, float top_p, float min_p, uint64_t *rng); +int ds4_session_sample_masked(ds4_session *s, float temperature, int top_k, + float top_p, float min_p, + const uint32_t *allow_mask, + size_t allow_mask_words, + const uint32_t *deny_mask, + size_t deny_mask_words, + uint64_t *rng); int ds4_session_top_logprobs(ds4_session *s, ds4_token_score *out, int k); int ds4_session_token_logprob(ds4_session *s, int token, ds4_token_score *out); int ds4_session_copy_logits(ds4_session *s, float *out, int cap); diff --git a/ds4_llguidance.c b/ds4_llguidance.c new file mode 100644 index 000000000..5c6854d2f --- /dev/null +++ b/ds4_llguidance.c @@ -0,0 +1,479 @@ +#include "ds4_llguidance.h" + +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef DS4_USE_LLGUIDANCE +#include <pthread.h> +#include "llguidance.h" +#endif + +#ifndef UINT32_C +#include <stdint.h> +#endif + +struct ds4_llguidance { +#ifdef DS4_USE_LLGUIDANCE + LlgTokenizer *tokenizer; + LlgMatcher *matcher; + const uint32_t *leading_ws_mask; + size_t leading_ws_words; + size_t mask_words; + int n_vocab; + int eos_token; + bool deny_leading_ws; + bool started; +#else + int unused; +#endif +}; + +bool ds4_llguidance_available(void) { +#ifdef DS4_USE_LLGUIDANCE + return true; +#else + return false; +#endif +} + +const char *ds4_llguidance_build_info(void) { +#ifdef DS4_USE_LLGUIDANCE + return "llguidance enabled"; +#else + return "llguidance disabled"; +#endif +} + +#ifdef DS4_USE_LLGUIDANCE + +typedef struct { + ds4_engine *engine; + LlgTokenizer *tokenizer; + uint32_t *leading_ws_mask; + size_t leading_ws_words; + int n_vocab; +} ds4_llg_cache; + +static pthread_mutex_t g_llg_cache_mu = PTHREAD_MUTEX_INITIALIZER; +static ds4_llg_cache g_llg_cache = {0}; + +static void set_err(char *err, 
size_t errlen, const char *fmt, ...) { + if (!err || errlen == 0) return; + va_list ap; + va_start(ap, fmt); + vsnprintf(err, errlen, fmt, ap); + va_end(ap); +} + +static bool json_ws_byte(unsigned char c) { + return c == ' ' || c == '\n' || c == '\r' || c == '\t'; +} + +static bool bytes_all_json_ws(const char *p, size_t len) { + if (!p || len == 0) return false; + for (size_t i = 0; i < len; i++) { + if (!json_ws_byte((unsigned char)p[i])) return false; + } + return true; +} + +static bool bytes_have_non_json_ws(const char *p, size_t len) { + if (!p) return false; + for (size_t i = 0; i < len; i++) { + if (!json_ws_byte((unsigned char)p[i])) return true; + } + return false; +} + +static bool token_text_is_special(const char *p, size_t len) { + static const char *specials[] = { + "<|begin▁of▁sentence|>", + "<|end▁of▁sentence|>", + "<|User|>", + "<|Assistant|>", + "<think>", + "</think>", + "|DSML|", + }; + for (size_t i = 0; i < sizeof(specials) / sizeof(specials[0]); i++) { + size_t n = strlen(specials[i]); + if (len == n && memcmp(p, specials[i], n) == 0) return true; + } + + const unsigned char bar[] = {0xef, 0xbd, 0x9c}; + for (size_t i = 0; i + sizeof(bar) <= len; i++) { + if (!memcmp(p + i, bar, sizeof(bar))) return true; + } + return false; +} + +static bool constraint_uses_json_leading_ws_rule(const char *constraint_type) { + return constraint_type && + (!strcmp(constraint_type, "json") || + !strcmp(constraint_type, "json_schema") || + !strcmp(constraint_type, "json_object")); +} + +static void bitset_set(uint32_t *mask, int token) { + mask[(uint32_t)token / 32u] |= UINT32_C(1) << ((uint32_t)token & 31u); +} + +static bool bitset_get(const uint32_t *mask, size_t words, uint32_t token) { + const size_t word = token / 32u; + if (!mask || word >= words) return false; + return (mask[word] & (UINT32_C(1) << (token & 31u))) != 0; +} + +static bool mask_has_non_denied_token(const uint32_t *allow, + size_t allow_words, + const uint32_t *deny, + size_t deny_words, + int n_vocab) 
{ + if (!allow) return false; + for (int i = 0; i < n_vocab; i++) { + if (bitset_get(allow, allow_words, (uint32_t)i) && + !bitset_get(deny, deny_words, (uint32_t)i)) + { + return true; + } + } + return false; +} + +static size_t ds4_llg_tokenize_fn(const void *user_data, + const uint8_t *bytes, + size_t bytes_len, + uint32_t *output_tokens, + size_t output_tokens_len) { + ds4_engine *e = (ds4_engine *)user_data; + char *text = malloc(bytes_len + 1); + if (!text) return 0; + memcpy(text, bytes, bytes_len); + text[bytes_len] = '\0'; + + ds4_tokens toks = {0}; + ds4_tokenize_text(e, text, &toks); + free(text); + + const size_t n = toks.len < 0 ? 0 : (size_t)toks.len; + const size_t copy = n < output_tokens_len ? n : output_tokens_len; + for (size_t i = 0; i < copy; i++) output_tokens[i] = (uint32_t)toks.v[i]; + ds4_tokens_free(&toks); + return n; +} + +static LlgTokenizer *build_tokenizer(ds4_engine *e, + uint32_t **leading_ws_mask_out, + size_t *leading_ws_words_out, + int *n_vocab_out, + char *err, + size_t errlen) { + const int n_vocab = ds4_engine_vocab_size(e); + if (n_vocab <= 0) { + set_err(err, errlen, "llguidance tokenizer cannot use an empty vocabulary"); + return NULL; + } + + size_t total = 0; + uint32_t *token_lens = calloc((size_t)n_vocab, sizeof(token_lens[0])); + if (!token_lens) { + set_err(err, errlen, "out of memory"); + return NULL; + } + + const size_t mask_words = ((size_t)n_vocab + 31u) / 32u; + uint32_t *leading_ws = calloc(mask_words, sizeof(leading_ws[0])); + if (!leading_ws) { + free(token_lens); + set_err(err, errlen, "out of memory"); + return NULL; + } + + for (int i = 0; i < n_vocab; i++) { + size_t len = 0; + char *piece = ds4_token_text(e, i, &len); + const bool special = token_text_is_special(piece, len); + token_lens[i] = (uint32_t)(len + (special ? 1u : 0u)); + total += token_lens[i]; + if (!special && bytes_all_json_ws(piece, len)) bitset_set(leading_ws, i); + free(piece); + } + + uint8_t *token_bytes = malloc(total ? 
total : 1); + if (!token_bytes) { + free(leading_ws); + free(token_lens); + set_err(err, errlen, "out of memory"); + return NULL; + } + + size_t off = 0; + for (int i = 0; i < n_vocab; i++) { + size_t len = 0; + char *piece = ds4_token_text(e, i, &len); + if (token_text_is_special(piece, len)) token_bytes[off++] = 0xffu; + memcpy(token_bytes + off, piece, len); + off += len; + free(piece); + } + + LlgTokenizerInit init = {0}; + init.vocab_size = (uint32_t)n_vocab; + init.tok_eos = (uint32_t)ds4_token_eos(e); + init.token_lens = token_lens; + init.token_bytes = token_bytes; + init.tokenize_assumes_string = true; + init.tokenize_fn = ds4_llg_tokenize_fn; + init.use_approximate_greedy_tokenize_fn = false; + init.tokenize_user_data = e; + init.slices = NULL; + + char llg_err[1024] = {0}; + LlgTokenizer *tok = llg_new_tokenizer(&init, llg_err, sizeof(llg_err)); + free(token_bytes); + free(token_lens); + if (!tok) { + free(leading_ws); + set_err(err, errlen, "llguidance tokenizer error: %s", llg_err); + return NULL; + } + + *leading_ws_mask_out = leading_ws; + *leading_ws_words_out = mask_words; + *n_vocab_out = n_vocab; + return tok; +} + +static LlgTokenizer *cached_tokenizer_clone(ds4_engine *e, + const uint32_t **leading_ws_mask_out, + size_t *leading_ws_words_out, + int *n_vocab_out, + char *err, + size_t errlen) { + LlgTokenizer *clone = NULL; + pthread_mutex_lock(&g_llg_cache_mu); + if (g_llg_cache.engine != e || !g_llg_cache.tokenizer) { + if (g_llg_cache.tokenizer) llg_free_tokenizer(g_llg_cache.tokenizer); + free(g_llg_cache.leading_ws_mask); + memset(&g_llg_cache, 0, sizeof(g_llg_cache)); + + uint32_t *leading_ws = NULL; + size_t leading_ws_words = 0; + int n_vocab = 0; + LlgTokenizer *tok = build_tokenizer(e, &leading_ws, &leading_ws_words, + &n_vocab, err, errlen); + if (!tok) { + pthread_mutex_unlock(&g_llg_cache_mu); + return NULL; + } + g_llg_cache.engine = e; + g_llg_cache.tokenizer = tok; + g_llg_cache.leading_ws_mask = leading_ws; + 
g_llg_cache.leading_ws_words = leading_ws_words; + g_llg_cache.n_vocab = n_vocab; + } + + clone = llg_clone_tokenizer(g_llg_cache.tokenizer); + if (leading_ws_mask_out) *leading_ws_mask_out = g_llg_cache.leading_ws_mask; + if (leading_ws_words_out) *leading_ws_words_out = g_llg_cache.leading_ws_words; + if (n_vocab_out) *n_vocab_out = g_llg_cache.n_vocab; + pthread_mutex_unlock(&g_llg_cache_mu); + if (!clone) set_err(err, errlen, "llguidance tokenizer clone failed"); + return clone; +} + +ds4_llguidance *ds4_llguidance_create(ds4_engine *e, + const char *constraint_type, + const char *constraint_data, + char *err, + size_t errlen) { + if (!e || !constraint_type || !constraint_type[0]) { + set_err(err, errlen, "invalid structured output constraint"); + return NULL; + } + + const uint32_t *leading_ws_mask = NULL; + size_t leading_ws_words = 0; + int n_vocab = 0; + LlgTokenizer *tok = cached_tokenizer_clone(e, &leading_ws_mask, + &leading_ws_words, + &n_vocab, err, errlen); + if (!tok) return NULL; + + LlgConstraintInit init; + llg_constraint_init_set_defaults(&init, tok); + const char *log_level = getenv("LLGUIDANCE_LOG_LEVEL"); + if (!log_level || !log_level[0]) log_level = getenv("DS4_LLGUIDANCE_LOG_LEVEL"); + if (log_level && log_level[0]) init.log_stderr_level = (uint32_t)atoi(log_level); + + LlgMatcher *matcher = llg_new_matcher(&init, constraint_type, + constraint_data ? constraint_data : ""); + const char *llg_err = matcher ? 
llg_matcher_get_error(matcher) : "allocation failed"; + if (llg_err) { + set_err(err, errlen, "llguidance grammar error: %s", llg_err); + if (matcher) llg_free_matcher(matcher); + llg_free_tokenizer(tok); + return NULL; + } + + const size_t mask_bytes = llg_matcher_get_mask_byte_size(matcher); + const size_t expected = ((size_t)n_vocab + 31u) / 32u * sizeof(uint32_t); + if (mask_bytes != expected) { + set_err(err, errlen, "llguidance mask size mismatch"); + llg_free_matcher(matcher); + llg_free_tokenizer(tok); + return NULL; + } + + ds4_llguidance *g = calloc(1, sizeof(*g)); + if (!g) { + set_err(err, errlen, "out of memory"); + llg_free_matcher(matcher); + llg_free_tokenizer(tok); + return NULL; + } + g->tokenizer = tok; + g->matcher = matcher; + g->leading_ws_mask = leading_ws_mask; + g->leading_ws_words = leading_ws_words; + g->mask_words = mask_bytes / sizeof(uint32_t); + g->n_vocab = n_vocab; + g->eos_token = ds4_token_eos(e); + g->deny_leading_ws = + constraint_uses_json_leading_ws_rule(constraint_type); + g->started = false; + return g; +} + +void ds4_llguidance_free(ds4_llguidance *g) { + if (!g) return; + if (g->matcher) llg_free_matcher(g->matcher); + if (g->tokenizer) llg_free_tokenizer(g->tokenizer); + free(g); +} + +int ds4_llguidance_sample(ds4_llguidance *g, + ds4_session *s, + float temperature, + int top_k, + float top_p, + float min_p, + uint64_t *rng, + char *err, + size_t errlen) { + if (!g || !g->matcher || !s) { + set_err(err, errlen, "structured output decoder is not active"); + return -1; + } + if (llg_matcher_is_stopped(g->matcher)) return g->eos_token; + if (llg_matcher_compute_mask(g->matcher) != 0) { + set_err(err, errlen, "llguidance mask error: %s", + llg_matcher_get_error(g->matcher)); + return -1; + } + const uint32_t *allow = llg_matcher_get_mask(g->matcher); + if (!allow) { + set_err(err, errlen, "llguidance did not return a token mask"); + return -1; + } + + const uint32_t *deny = NULL; + size_t deny_words = 0; + if 
(g->deny_leading_ws && + !g->started && + mask_has_non_denied_token(allow, g->mask_words, g->leading_ws_mask, + g->leading_ws_words, g->n_vocab)) + { + deny = g->leading_ws_mask; + deny_words = g->leading_ws_words; + } + + int token = ds4_session_sample_masked(s, temperature, top_k, top_p, min_p, + allow, g->mask_words, deny, + deny_words, rng); + if (token < 0) set_err(err, errlen, "llguidance mask allowed no sampleable token"); + return token; +} + +bool ds4_llguidance_accept(ds4_llguidance *g, + ds4_engine *e, + int token, + char *err, + size_t errlen) { + if (!g || !g->matcher) return true; + if (token < 0) return true; + if (llg_matcher_consume_token(g->matcher, (uint32_t)token) != 0) { + set_err(err, errlen, "llguidance consume error: %s", + llg_matcher_get_error(g->matcher)); + return false; + } + if (g->deny_leading_ws && !g->started && e) { + size_t len = 0; + char *piece = ds4_token_text(e, token, &len); + if (bytes_have_non_json_ws(piece, len)) g->started = true; + free(piece); + } + return true; +} + +#else + +ds4_llguidance *ds4_llguidance_create(ds4_engine *e, + const char *constraint_type, + const char *constraint_data, + char *err, + size_t errlen) { + (void)e; + (void)constraint_type; + (void)constraint_data; + if (err && errlen) { + snprintf(err, errlen, + "structured outputs require building ds4 with LLGUIDANCE=1"); + } + return NULL; +} + +void ds4_llguidance_free(ds4_llguidance *g) { + (void)g; +} + +int ds4_llguidance_sample(ds4_llguidance *g, + ds4_session *s, + float temperature, + int top_k, + float top_p, + float min_p, + uint64_t *rng, + char *err, + size_t errlen) { + (void)g; + (void)s; + (void)temperature; + (void)top_k; + (void)top_p; + (void)min_p; + (void)rng; + if (err && errlen) { + snprintf(err, errlen, + "structured outputs require building ds4 with LLGUIDANCE=1"); + } + return -1; +} + +bool ds4_llguidance_accept(ds4_llguidance *g, + ds4_engine *e, + int token, + char *err, + size_t errlen) { + (void)g; + (void)e; + 
(void)token; + (void)err; + (void)errlen; + return true; +} + +#endif diff --git a/ds4_llguidance.h b/ds4_llguidance.h new file mode 100644 index 000000000..f677f3b13 --- /dev/null +++ b/ds4_llguidance.h @@ -0,0 +1,37 @@ +#ifndef DS4_LLGUIDANCE_H +#define DS4_LLGUIDANCE_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#include "ds4.h" + +typedef struct ds4_llguidance ds4_llguidance; + +bool ds4_llguidance_available(void); +const char *ds4_llguidance_build_info(void); + +ds4_llguidance *ds4_llguidance_create(ds4_engine *e, + const char *constraint_type, + const char *constraint_data, + char *err, + size_t errlen); +void ds4_llguidance_free(ds4_llguidance *g); + +int ds4_llguidance_sample(ds4_llguidance *g, + ds4_session *s, + float temperature, + int top_k, + float top_p, + float min_p, + uint64_t *rng, + char *err, + size_t errlen); +bool ds4_llguidance_accept(ds4_llguidance *g, + ds4_engine *e, + int token, + char *err, + size_t errlen); + +#endif diff --git a/ds4_server.c b/ds4_server.c index f5c96e885..df750a9c8 100644 --- a/ds4_server.c +++ b/ds4_server.c @@ -2,6 +2,7 @@ #include "ds4_distributed.h" #include "ds4_help.h" #include "ds4_kvstore.h" +#include "ds4_llguidance.h" #include "rax.h" /* OpenAI/Anthropic compatible local server. 
@@ -403,6 +404,485 @@ static char *json_minify_raw_value(const char *json) {
     return buf_take(&b);
 }
 
+typedef enum {
+    DS4_TEXT_FORMAT_TEXT,        /* Zero value: plain text, no constraint. */
+    DS4_TEXT_FORMAT_JSON_OBJECT,
+    DS4_TEXT_FORMAT_JSON_SCHEMA,
+    DS4_TEXT_FORMAT_REGEX,
+    DS4_TEXT_FORMAT_LARK,
+    DS4_TEXT_FORMAT_LLGUIDANCE,
+} ds4_text_format_type;
+/* Parsed output-format request: constraint kind plus heap-owned name/payload strings. */
+typedef struct {
+    ds4_text_format_type type;
+    char *name;
+    char *schema_json;
+    bool strict;
+} ds4_text_format;
+/* Free the owned strings and zero *f (the zeroed type is DS4_TEXT_FORMAT_TEXT). NULL-safe. */
+static void ds4_text_format_clear(ds4_text_format *f) {
+    if (!f) return;
+    free(f->name);
+    free(f->schema_json);
+    memset(f, 0, sizeof(*f));
+}
+/* True for every constraint kind except plain text; false when f is NULL. */
+static bool ds4_text_format_is_structured(const ds4_text_format *f) {
+    return f && (f->type == DS4_TEXT_FORMAT_JSON_OBJECT ||
+                 f->type == DS4_TEXT_FORMAT_JSON_SCHEMA ||
+                 f->type == DS4_TEXT_FORMAT_REGEX ||
+                 f->type == DS4_TEXT_FORMAT_LARK ||
+                 f->type == DS4_TEXT_FORMAT_LLGUIDANCE);
+}
+/* Reset *f, then take ownership of name and schema_json (either may be NULL). */
+static void ds4_text_format_set_schema(ds4_text_format *f,
+                                       ds4_text_format_type type,
+                                       char *name,
+                                       char *schema_json,
+                                       bool strict) {
+    ds4_text_format_clear(f);
+    f->type = type;
+    f->name = name;
+    f->schema_json = schema_json;
+    f->strict = strict;
+}
+/* Store a regex/grammar payload in the schema_json slot, with no name and strict=false. */
+static void ds4_text_format_set_constraint(ds4_text_format *f,
+                                           ds4_text_format_type type,
+                                           char *constraint_data) {
+    ds4_text_format_set_schema(f, type, NULL, constraint_data, false);
+}
+/* Constraint-type string passed to ds4_llguidance_create(); "text" for NULL or unconstrained. */
+static const char *ds4_text_format_constraint_type(const ds4_text_format *f) {
+    if (!f) return "text";
+    if (f->type == DS4_TEXT_FORMAT_JSON_SCHEMA) return "json_schema";
+    if (f->type == DS4_TEXT_FORMAT_JSON_OBJECT) {
+        return f->schema_json ? "json_schema" : "json_object";
+    }
+    if (f->type == DS4_TEXT_FORMAT_REGEX) return "regex";
+    if (f->type == DS4_TEXT_FORMAT_LARK) return "lark";
+    if (f->type == DS4_TEXT_FORMAT_LLGUIDANCE) return "llguidance";
+    return "text";
+}
+/* Constraint payload passed to ds4_llguidance_create(); never NULL (empty string when unset). */
+static const char *ds4_text_format_constraint_data(const ds4_text_format *f) {
+    return f && f->schema_json ? f->schema_json : "";
+}
+/* Dry run: create and immediately free a matcher; on failure write a message to err and return false. */
+static bool ds4_text_format_validate_with_llguidance(ds4_engine *e,
+                                                     const ds4_text_format *f,
+                                                     char *err,
+                                                     size_t errlen) {
+    if (!ds4_text_format_is_structured(f)) return true;
+    if (!ds4_llguidance_available()) {
+        snprintf(err, errlen,
+                 "structured outputs require building ds4 with LLGUIDANCE=1");
+        return false;
+    }
+
+    char llg_err[160] = {0};
+    ds4_llguidance *g = ds4_llguidance_create(
+        e,
+        ds4_text_format_constraint_type(f),
+        ds4_text_format_constraint_data(f),
+        llg_err,
+        sizeof(llg_err));
+    if (!g) {
+        snprintf(err, errlen, "invalid structured output constraint: %s",
+                 llg_err[0] ? llg_err : "llguidance rejected constraint");
+        return false;
+    }
+    ds4_llguidance_free(g);
+    return true;
+}
+/* Parse the chat-style {"name","schema","strict"} wrapper into *format; err is written only when "schema" is missing. */
+static bool parse_json_schema_wrapper(const char **p,
+                                      ds4_text_format *format,
+                                      char *err,
+                                      size_t errlen) {
+    json_ws(p);
+    if (**p != '{') return false;
+    (*p)++;
+    char *name = NULL;
+    char *schema = NULL;
+    bool strict = false;
+    json_ws(p);
+    while (**p && **p != '}') {
+        char *key = NULL;
+        if (!json_string(p, &key)) goto bad;
+        json_ws(p);
+        if (**p != ':') {
+            free(key);
+            goto bad;
+        }
+        (*p)++;
+        if (!strcmp(key, "name")) {
+            free(name);
+            if (!json_string(p, &name)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "schema")) {
+            free(schema);
+            if (!json_raw_value(p, &schema)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "strict")) {
+            if (!json_bool(p, &strict)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!json_skip_value(p)) {
+            free(key);
+            goto bad;
+        }
+        free(key);
+        json_ws(p);
+        if (**p == ',') (*p)++;
+        json_ws(p);
+    }
+    if (**p != '}') goto bad;
+    (*p)++;
+    if (!schema) {
+        snprintf(err, errlen, "json_schema.schema is required");
+        free(name);
+        return false;
+    }
+    ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                               name, schema, strict);
+    return true;
+bad:
+    free(name);
+    free(schema);
+    return false;
+}
+/* Parse a chat response_format value (object or null) into *format; returns false on error, with err possibly left unset. */
+static bool parse_chat_response_format(const char **p,
+                                       ds4_text_format *format,
+                                       char *err,
+                                       size_t errlen) {
+    json_ws(p);
+    if (json_lit(p, "null")) {
+        ds4_text_format_clear(format);
+        return true;
+    }
+    if (**p != '{') return false;
+    (*p)++;
+
+    char *type = NULL;
+    char *schema = NULL;
+    char *regex = NULL;
+    char *grammar = NULL;
+    char *name = NULL;
+    bool strict = false;
+    bool saw_json_schema = false;
+    json_ws(p);
+    while (**p && **p != '}') {
+        char *key = NULL;
+        if (!json_string(p, &key)) goto bad;
+        json_ws(p);
+        if (**p != ':') {
+            free(key);
+            goto bad;
+        }
+        (*p)++;
+        if (!strcmp(key, "type")) {
+            free(type);
+            if (!json_string(p, &type)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "json_schema")) {
+            saw_json_schema = true;
+            if (!parse_json_schema_wrapper(p, format, err, errlen)) {
+                free(key);
+                goto bad_keep_err;
+            }
+        } else if (!strcmp(key, "schema")) {
+            free(schema);
+            if (!json_raw_value(p, &schema)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "regex")) {
+            free(regex);
+            if (!json_string(p, &regex)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "grammar")) {
+            free(grammar);
+            if (!json_string(p, &grammar)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "name")) {
+            free(name);
+            if (!json_string(p, &name)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "strict")) {
+            if (!json_bool(p, &strict)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!json_skip_value(p)) {
+            free(key);
+            goto bad;
+        }
+        free(key);
+        json_ws(p);
+        if (**p == ',') (*p)++;
+        json_ws(p);
+    }
+    if (**p != '}') goto bad;
+    (*p)++;
+
+    if (!type || !strcmp(type, "text")) {
+        ds4_text_format_clear(format);
+    } else if (!strcmp(type, "json_object")) {
+        if (schema) {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                                       name, schema, strict);
+            name = NULL;
+            schema = NULL;
+        } else {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_OBJECT,
+                                       NULL, NULL, false);
+        }
+    } else if (!strcmp(type, "json_schema")) {
+        if (!saw_json_schema && schema) {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                                       name, schema, strict);
+            name = NULL;
+            schema = NULL;
+        } else if (!saw_json_schema || !format->schema_json) { /* a schema left in *format by an earlier parse does not count */
+            snprintf(err, errlen, "response_format json_schema.schema is required");
+            goto bad_keep_err;
+        }
+    } else if (!strcmp(type, "regex")) {
+        if (!regex) {
+            snprintf(err, errlen, "response_format.regex is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_REGEX, regex);
+        regex = NULL;
+    } else if (!strcmp(type, "lark")) {
+        if (!grammar) {
+            snprintf(err, errlen, "response_format.grammar is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_LARK, grammar);
+        grammar = NULL;
+    } else if (!strcmp(type, "llguidance")) {
+        if (!grammar) {
+            snprintf(err, errlen, "response_format.grammar is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_LLGUIDANCE, grammar);
+        grammar = NULL;
+    } else {
+        snprintf(err, errlen, "response_format.type=%s not supported", type);
+        goto bad_keep_err;
+    }
+
+    free(type);
+    free(name);
+    free(schema);
+    free(regex);
+    free(grammar);
+    return true;
+bad:
+    snprintf(err, errlen, "invalid response_format");
+bad_keep_err:
+    free(type);
+    free(name);
+    free(schema);
+    free(regex);
+    free(grammar);
+    return false;
+}
+/* Parse the object under text.format (flat layout: type/name/schema/regex/grammar/strict) or null into *format. */
+static bool parse_responses_text_format_object(const char **p,
+                                               ds4_text_format *format,
+                                               char *err,
+                                               size_t errlen) {
+    json_ws(p);
+    if (json_lit(p, "null")) {
+        ds4_text_format_clear(format);
+        return true;
+    }
+    if (**p != '{') return false;
+    (*p)++;
+    char *type = NULL;
+    char *name = NULL;
+    char *schema = NULL;
+    char *regex = NULL;
+    char *grammar = NULL;
+    bool strict = false;
+    json_ws(p);
+    while (**p && **p != '}') {
+        char *key = NULL;
+        if (!json_string(p, &key)) goto bad;
+        json_ws(p);
+        if (**p != ':') {
+            free(key);
+            goto bad;
+        }
+        (*p)++;
+        if (!strcmp(key, "type")) {
+            free(type);
+            if (!json_string(p, &type)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "name")) {
+            free(name);
+            if (!json_string(p, &name)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "schema")) {
+            free(schema);
+            if (!json_raw_value(p, &schema)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "regex")) {
+            free(regex);
+            if (!json_string(p, &regex)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "grammar")) {
+            free(grammar);
+            if (!json_string(p, &grammar)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "strict")) {
+            if (!json_bool(p, &strict)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!json_skip_value(p)) {
+            free(key);
+            goto bad;
+        }
+        free(key);
+        json_ws(p);
+        if (**p == ',') (*p)++;
+        json_ws(p);
+    }
+    if (**p != '}') goto bad;
+    (*p)++;
+
+    if (!type || !strcmp(type, "text")) {
+        ds4_text_format_clear(format);
+    } else if (!strcmp(type, "json_object")) {
+        if (schema) {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                                       name, schema, strict);
+            name = NULL;
+            schema = NULL;
+        } else {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_OBJECT,
+                                       NULL, NULL, false);
+        }
+    } else if (!strcmp(type, "json_schema")) {
+        if (!schema) {
+            snprintf(err, errlen, "text.format.schema is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                                   name, schema, strict);
+        name = NULL;
+        schema = NULL;
+    } else if (!strcmp(type, "regex")) {
+        if (!regex) {
+            snprintf(err, errlen, "text.format.regex is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_REGEX, regex);
+        regex = NULL;
+    } else if (!strcmp(type, "lark")) {
+        if (!grammar) {
+            snprintf(err, errlen, "text.format.grammar is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_LARK, grammar);
+        grammar = NULL;
+    } else if (!strcmp(type, "llguidance")) {
+        if (!grammar) {
+            snprintf(err, errlen, "text.format.grammar is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_LLGUIDANCE, grammar);
+        grammar = NULL;
+    } else {
+        snprintf(err, errlen, "text.format.type=%s not supported", type);
+        goto bad_keep_err;
+    }
+
+    free(type);
+    free(name);
+    free(schema);
+    free(regex);
+    free(grammar);
+    return true;
+bad:
+    snprintf(err, errlen, "invalid text.format");
+bad_keep_err:
+    free(type);
+    free(name);
+    free(schema);
+    free(regex);
+    free(grammar);
+    return false;
+}
+/* Parse the `text` request value (object or null): only its "format" member is interpreted, other keys are skipped. */
+static bool parse_responses_text_value(const char **p,
+                                       ds4_text_format *format,
+                                       char *err,
+                                       size_t errlen) {
+    json_ws(p);
+    if (json_lit(p, "null")) {
+        ds4_text_format_clear(format);
+        return true;
+    }
+    if (**p != '{') return false;
+    (*p)++;
+    json_ws(p);
+    while (**p && **p != '}') {
+        char *key = NULL;
+        if (!json_string(p, &key)) return false;
+        json_ws(p);
+        if (**p != ':') {
+            free(key);
+            return false;
+        }
+        (*p)++;
+        if (!strcmp(key, "format")) {
+            if (!parse_responses_text_format_object(p, format, err, errlen)) {
+                free(key);
+                return false;
+            }
+        } else if (!json_skip_value(p)) {
+            free(key);
+            return false;
+        }
+        free(key);
+        json_ws(p);
+        if (**p == ',') (*p)++;
+        json_ws(p);
+    }
+    if (**p != '}') return false;
+    (*p)++;
+    return true;
+}
+
 static bool json_content(const char **p, char **out) {
     json_ws(p);
     if (**p == '"') return json_string(p, out);
@@ -602,6 +1082,7 @@ typedef struct {
     int cache_read_tokens;
     int cache_write_tokens;
     ds4_think_mode think_mode;
+    ds4_text_format text_format;
     bool has_tools;
     bool prompt_preserves_reasoning;
     /* For /v1/responses: emit reasoning_summary_* events / fields only when the
@@ -764,6 +1245,7 @@ static void request_free(request *r) {
     free(r->stops.v);
     free(r->raw_body);
     free(r->prompt_text);
+    ds4_text_format_clear(&r->text_format);
     stop_list_clear(&r->responses_live_call_ids);
     free(r->responses_live_call_ids.v);
     free(r->responses_live_suffix_text);
@@ -2727,6 +3209,15 @@ static bool parse_chat_request(ds4_engine *e, server *s, const char *body, int d
free(key); goto bad; } + } else if (!strcmp(key, "response_format")) { + if (!parse_chat_response_format(&p, &r->text_format, err, errlen)) { + free(key); + chat_msgs_free(&msgs); + free(tool_schemas); + if (!err[0]) snprintf(err, errlen, "invalid response_format"); + request_free(r); + return false; + } } else if (!strcmp(key, "thinking")) { if (!parse_thinking_control_value(&p, &thinking_enabled)) { free(key); @@ -2767,6 +3258,23 @@ static bool parse_chat_request(ds4_engine *e, server *s, const char *body, int d return false; } r->has_tools = tool_schemas && tool_schemas[0] && !tool_choice_none; + if (ds4_text_format_is_structured(&r->text_format)) { + if (r->has_tools) { + snprintf(err, errlen, + "structured outputs with tools are not supported"); + chat_msgs_free(&msgs); + free(tool_schemas); + request_free(r); + return false; + } + if (!ds4_text_format_validate_with_llguidance(e, &r->text_format, + err, errlen)) { + chat_msgs_free(&msgs); + free(tool_schemas); + request_free(r); + return false; + } + } if (!got_thinking && model_alias_disables_thinking(r->model)) thinking_enabled = false; if (!got_thinking && model_alias_enables_thinking(r->model)) thinking_enabled = true; r->think_mode = ds4_think_mode_for_context( @@ -3816,6 +4324,17 @@ static bool parse_responses_request(ds4_engine *e, server *s, const char *body, free(key); goto bad; } + } else if (!strcmp(key, "text")) { + if (!parse_responses_text_value(&p, &r->text_format, err, errlen)) { + free(key); + chat_msgs_free(&msgs); + buf_free(&loaded_tool_schemas); + free(instructions); + free(tool_schemas); + if (!err[0]) snprintf(err, errlen, "invalid text"); + request_free(r); + return false; + } } else if (!strcmp(key, "reasoning")) { bool effort_seen = false; if (!parse_responses_reasoning(&p, &reasoning_effort, @@ -3905,6 +4424,29 @@ static bool parse_responses_request(ds4_engine *e, server *s, const char *body, (!tool_choice_none && combined_tool_schemas.len) ? 
combined_tool_schemas.ptr : NULL; r->has_tools = active_tool_schemas && active_tool_schemas[0]; + if (ds4_text_format_is_structured(&r->text_format)) { + if (r->has_tools) { + snprintf(err, errlen, + "structured outputs with tools are not supported"); + chat_msgs_free(&msgs); + buf_free(&combined_tool_schemas); + buf_free(&loaded_tool_schemas); + free(instructions); + free(tool_schemas); + request_free(r); + return false; + } + if (!ds4_text_format_validate_with_llguidance(e, &r->text_format, + err, errlen)) { + chat_msgs_free(&msgs); + buf_free(&combined_tool_schemas); + buf_free(&loaded_tool_schemas); + free(instructions); + free(tool_schemas); + request_free(r); + return false; + } + } if (!got_thinking && model_alias_disables_thinking(r->model)) thinking_enabled = false; if (!got_thinking && model_alias_enables_thinking(r->model)) thinking_enabled = true; r->think_mode = ds4_think_mode_for_context( @@ -5967,6 +6509,11 @@ static bool request_uses_structured_stream(const request *r) { request_uses_openai_live_stream(r)); } +static bool request_uses_structured_decoder(const request *r) { + return r && r->kind == REQ_CHAT && + ds4_text_format_is_structured(&r->text_format); +} + /* Codex' Responses API uses 24-hex suffixes for response/item ids. Prefix * controls the variant (resp_, rs_, msg_, fc_) so each event references a * stable identifier across output_item.added / .done. 
*/ @@ -9908,6 +10455,7 @@ static bool should_canonicalize_tool_checkpoint(const server *s, const tool_call static void generate_job(server *s, job *j) { char err[160]; err[0] = '\0'; + ds4_llguidance *structured = NULL; const int old_pos = ds4_session_pos(s->session); const int common = ds4_session_common_prefix(s->session, &j->req.prompt); trace_cache_diag cache_diag = {0}; @@ -10065,6 +10613,25 @@ static void generate_job(server *s, job *j) { char req_flags[64]; log_flags(req_flags, sizeof(req_flags), responses_protocol, j->req.has_tools, false, false, false); + if (request_uses_structured_decoder(&j->req)) { + structured = ds4_llguidance_create( + s->engine, + ds4_text_format_constraint_type(&j->req.text_format), + ds4_text_format_constraint_data(&j->req.text_format), + err, + sizeof(err)); + if (!structured) { + ds4_tokens_free(&effective_prompt); + free(disk_cache_path); + trace_event(s, trace_id, "structured output init failed: %s", + err[0] ? err : "unknown error"); + http_error(j->fd, s->enable_cors, 400, + err[0] ? 
err : "structured output init failed"); + return; + } + trace_event(s, trace_id, "structured output constraint=%s", + ds4_text_format_constraint_type(&j->req.text_format)); + } if (responses_live_continuation) { server_log(DS4_LOG_PREFILL, "ds4-server: responses live continuation RESPPROTO match=%s ids=%d cached=%d prompt=%d", @@ -10150,6 +10717,7 @@ static void generate_job(server *s, job *j) { cold_store_len); kv_cache_discard_failed_disk_entry(s, disk_cache_path); free(disk_cache_path); + ds4_llguidance_free(structured); trace_event(s, trace_id, "prefill failed: %s", err); send_prefill_failure_response(s, j, &progress, ctx_span, req_flags, err); return; @@ -10173,6 +10741,7 @@ static void generate_job(server *s, job *j) { cold_store_len); kv_cache_discard_failed_disk_entry(s, disk_cache_path); free(disk_cache_path); + ds4_llguidance_free(structured); trace_event(s, trace_id, "prefill failed: %s", err); send_prefill_failure_response(s, j, &progress, ctx_span, req_flags, err); return; @@ -10223,6 +10792,7 @@ static void generate_job(server *s, job *j) { req_flags[0] ? " " : "", req_flags); ds4_tokens_free(&effective_prompt); + ds4_llguidance_free(structured); return; } /* The prefill progress callback may have already sent the SSE headers @@ -10236,6 +10806,7 @@ static void generate_job(server *s, job *j) { req_flags[0] ? 
" " : "", req_flags); ds4_tokens_free(&effective_prompt); + ds4_llguidance_free(structured); return; } progress.headers_sent = true; @@ -10244,12 +10815,14 @@ static void generate_job(server *s, job *j) { prompt_tokens, &anthropic_live)) { server_log(DS4_LOG_GENERATION, "ds4-server: chat ctx=%s anthropic stream start failed", ctx_span); ds4_tokens_free(&effective_prompt); + ds4_llguidance_free(structured); return; } if (j->req.api == API_OPENAI && j->req.kind == REQ_CHAT && !sse_chunk(j->fd, &j->req, id, NULL, NULL)) { server_log(DS4_LOG_GENERATION, "ds4-server: chat ctx=%s openai role chunk failed", ctx_span); ds4_tokens_free(&effective_prompt); + ds4_llguidance_free(structured); return; } if (openai_live_chat) openai_stream_start(&j->req, &openai_live); @@ -10264,6 +10837,7 @@ static void generate_job(server *s, job *j) { req_flags); responses_stream_free(&responses_live); ds4_tokens_free(&effective_prompt); + ds4_llguidance_free(structured); return; } } @@ -10294,6 +10868,11 @@ static void generate_job(server *s, job *j) { double last_decode_log_t = decode_t0; int last_decode_log_completion = 0; thinking_state thinking = thinking_state_from_prompt(&j->req); + bool structured_waiting_for_think_close = structured && thinking.inside; + if (structured_waiting_for_think_close) { + trace_event(s, trace_id, + "structured output constraint delayed until "); + } const bool thinking_gates_tool_markers = ds4_think_mode_enabled(j->req.think_mode); bool tool_scan_waiting_for_think_close = thinking_gates_tool_markers && thinking.inside; @@ -10321,7 +10900,16 @@ static void generate_job(server *s, job *j) { if (in_tool_call && !dsml_decode_state_uses_payload_sampling(dsml_state)) { temperature = 0.0f; } - int token = ds4_session_sample(s->session, temperature, top_k, top_p, min_p, &rng); + bool structured_active = structured && !structured_waiting_for_think_close; + int token = structured_active ? 
+ ds4_llguidance_sample(structured, s->session, + temperature, top_k, top_p, min_p, + &rng, err, sizeof(err)) : + ds4_session_sample(s->session, temperature, top_k, top_p, min_p, &rng); + if (token < 0) { + finish = "error"; + break; + } if (token == ds4_token_eos(s->engine)) { finish = "stop"; break; @@ -10329,7 +10917,9 @@ static void generate_job(server *s, job *j) { int toks[17]; int ntok = 0; - if (temperature <= 0.0f && + if (!structured_active && + !structured_waiting_for_think_close && + temperature <= 0.0f && ds4_engine_mtp_draft_tokens(s->engine) > 1 && getenv("DS4_MTP_SPEC_DISABLE") == NULL) { @@ -10362,6 +10952,14 @@ static void generate_job(server *s, job *j) { stop_decode = true; break; } + structured_active = structured && !structured_waiting_for_think_close; + if (structured_active && + !ds4_llguidance_accept(structured, s->engine, token, + err, sizeof(err))) { + finish = "error"; + stop_decode = true; + break; + } size_t piece_len = 0; char *piece = ds4_token_text(s->engine, token, &piece_len); @@ -10369,7 +10967,16 @@ static void generate_job(server *s, job *j) { trace_piece(s, trace_id, piece, piece_len); buf_append(&text, piece, piece_len); + bool was_thinking_inside = thinking.inside; thinking_state_feed(&thinking, piece, piece_len); + if (structured_waiting_for_think_close && + was_thinking_inside && + !thinking.inside) + { + structured_waiting_for_think_close = false; + trace_event(s, trace_id, + "structured output constraint activated after "); + } if (j->req.kind == REQ_CHAT && j->req.has_tools) { dsml_decode_tracker_update(&dsml_tracker, text.ptr, text.len); } @@ -10917,6 +11524,7 @@ static void generate_job(server *s, job *j) { anthropic_stream_free(&anthropic_live); openai_stream_free(&openai_live); responses_stream_free(&responses_live); + ds4_llguidance_free(structured); buf_free(&text); ds4_tokens_free(&effective_prompt); } @@ -11678,6 +12286,204 @@ static void test_assert(bool cond, const char *file, int line, const char *expr) 
#define TEST_ASSERT(expr) test_assert((expr), __FILE__, __LINE__, #expr)
 
+static void test_parse_chat_response_format_json_schema(void) { /* Nested json_schema wrapper -> JSON_SCHEMA with name, strict and raw schema. */
+    const char *json =
+        "{\"type\":\"json_schema\",\"json_schema\":{"
+        "\"name\":\"calendar_event\",\"strict\":true,"
+        "\"schema\":{\"type\":\"object\",\"properties\":{"
+        "\"name\":{\"type\":\"string\"}},\"required\":[\"name\"],"
+        "\"additionalProperties\":false}}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+
+    TEST_ASSERT(parse_chat_response_format(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_JSON_SCHEMA);
+    TEST_ASSERT(fmt.name && !strcmp(fmt.name, "calendar_event"));
+    TEST_ASSERT(fmt.strict);
+    TEST_ASSERT(fmt.schema_json && strstr(fmt.schema_json, "\"additionalProperties\""));
+    json_ws(&p);
+    TEST_ASSERT(*p == '\0');
+
+    ds4_text_format_clear(&fmt);
+}
+/* Chat: a bare json_object type yields JSON_OBJECT with no schema and constraint type "json_object". */
+static void test_parse_chat_response_format_json_object(void) {
+    const char *json = "{\"type\":\"json_object\"}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+
+    TEST_ASSERT(parse_chat_response_format(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_JSON_OBJECT);
+    TEST_ASSERT(fmt.schema_json == NULL);
+    TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt), "json_object"));
+
+    ds4_text_format_clear(&fmt);
+}
+/* Chat: regex / lark / llguidance types keep their payload string and report the matching constraint type. */
+static void test_parse_chat_response_format_llguidance_extensions(void) {
+    const struct {
+        const char *json;
+        ds4_text_format_type type;
+        const char *constraint_type;
+        const char *needle;
+    } cases[] = {
+        {
+            "{\"type\":\"regex\",\"regex\":\"INV-[0-9]{4}\"}",
+            DS4_TEXT_FORMAT_REGEX,
+            "regex",
+            "INV-"
+        },
+        {
+            "{\"type\":\"lark\",\"grammar\":\"%llguidance {}\\nstart: /OK/\"}",
+            DS4_TEXT_FORMAT_LARK,
+            "lark",
+            "start:"
+        },
+        {
+            "{\"type\":\"llguidance\",\"grammar\":\"{\\\"grammars\\\":[]}\"}",
+            DS4_TEXT_FORMAT_LLGUIDANCE,
+            "llguidance",
+            "grammars"
+        },
+    };
+
+    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+        const char *p = cases[i].json;
+        ds4_text_format fmt = {0};
+        char err[160] = {0};
+
+        TEST_ASSERT(parse_chat_response_format(&p, &fmt, err, sizeof(err)));
+        TEST_ASSERT(fmt.type == cases[i].type);
+        TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt),
+                            cases[i].constraint_type));
+        TEST_ASSERT(fmt.schema_json && strstr(fmt.schema_json, cases[i].needle));
+        json_ws(&p);
+        TEST_ASSERT(*p == '\0');
+
+        ds4_text_format_clear(&fmt);
+    }
+}
+/* Chat: a json_schema wrapper without "schema" is rejected with a "schema is required" message. */
+static void test_parse_chat_response_format_rejects_missing_schema(void) {
+    const char *json = "{\"type\":\"json_schema\",\"json_schema\":{\"name\":\"bad\"}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+
+    TEST_ASSERT(!parse_chat_response_format(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(strstr(err, "schema is required") != NULL);
+
+    ds4_text_format_clear(&fmt);
+}
+/* Responses: text.format json_schema uses the flat layout (name/strict/schema next to type). */
+static void test_parse_responses_text_format_json_schema(void) {
+    const char *json =
+        "{\"format\":{\"type\":\"json_schema\","
+        "\"name\":\"calendar_event\",\"strict\":true,"
+        "\"schema\":{\"type\":\"object\",\"properties\":{"
+        "\"date\":{\"type\":\"string\"}},\"required\":[\"date\"],"
+        "\"additionalProperties\":false}}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+
+    TEST_ASSERT(parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_JSON_SCHEMA);
+    TEST_ASSERT(fmt.name && !strcmp(fmt.name, "calendar_event"));
+    TEST_ASSERT(fmt.strict);
+    TEST_ASSERT(fmt.schema_json && strstr(fmt.schema_json, "\"required\""));
+    TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt), "json_schema"));
+    json_ws(&p);
+    TEST_ASSERT(*p == '\0');
+
+    ds4_text_format_clear(&fmt);
+}
+/* Responses: text.format json_object yields JSON_OBJECT with no schema. */
+static void test_parse_responses_text_format_json_object(void) {
+    const char *json = "{\"format\":{\"type\":\"json_object\"}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+
+    TEST_ASSERT(parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_JSON_OBJECT);
+    TEST_ASSERT(fmt.schema_json == NULL);
+    TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt), "json_object"));
+
+    ds4_text_format_clear(&fmt);
+}
+/* Responses: regex / lark / llguidance formats keep their payload string and report the matching constraint type. */
+static void test_parse_responses_text_format_llguidance_extensions(void) {
+    const struct {
+        const char *json;
+        ds4_text_format_type type;
+        const char *constraint_type;
+        const char *needle;
+    } cases[] = {
+        {
+            "{\"format\":{\"type\":\"regex\",\"regex\":\"INV-[0-9]{4}\"}}",
+            DS4_TEXT_FORMAT_REGEX,
+            "regex",
+            "INV-"
+        },
+        {
+            "{\"format\":{\"type\":\"lark\",\"grammar\":\"%llguidance {}\\nstart: /OK/\"}}",
+            DS4_TEXT_FORMAT_LARK,
+            "lark",
+            "start:"
+        },
+        {
+            "{\"format\":{\"type\":\"llguidance\",\"grammar\":\"{\\\"grammars\\\":[]}\"}}",
+            DS4_TEXT_FORMAT_LLGUIDANCE,
+            "llguidance",
+            "grammars"
+        },
+    };
+
+    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+        const char *p = cases[i].json;
+        ds4_text_format fmt = {0};
+        char err[160] = {0};
+
+        TEST_ASSERT(parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+        TEST_ASSERT(fmt.type == cases[i].type);
+        TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt),
+                            cases[i].constraint_type));
+        TEST_ASSERT(fmt.schema_json && strstr(fmt.schema_json, cases[i].needle));
+        json_ws(&p);
+        TEST_ASSERT(*p == '\0');
+
+        ds4_text_format_clear(&fmt);
+    }
+}
+/* Responses: an unknown text.format type is rejected with a "not supported" message. */
+static void test_parse_responses_text_format_rejects_unknown_type(void) {
+    const char *json = "{\"format\":{\"type\":\"xml\"}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+
+    TEST_ASSERT(!parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(strstr(err, "not supported") != NULL);
+
+    ds4_text_format_clear(&fmt);
+}
+/* Responses: type "text" leaves the format at its zero value (TEXT, no schema). */
+static void test_parse_responses_text_format_text_is_noop(void) {
+    const char *json = "{\"format\":{\"type\":\"text\"}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+
+    TEST_ASSERT(parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_TEXT);
+    TEST_ASSERT(fmt.schema_json == NULL);
+
+    ds4_text_format_clear(&fmt);
+}
+
 static void test_tool_schema_order_from_anthropic_schema(void) {
     tool_schema_orders orders = {0};
     tool_schema_orders_add_json(&orders,
@@ -15465,6 +16271,15 @@ static void ds4_server_unit_tests_run(void) {
     test_render_drops_old_reasoning_without_tools();
     test_render_preserves_reasoning_with_tools();
     test_render_chat_prompt_text_renders_tools_before_system();
+    test_parse_chat_response_format_json_schema();
+    test_parse_chat_response_format_json_object();
+    test_parse_chat_response_format_llguidance_extensions();
+    test_parse_chat_response_format_rejects_missing_schema();
+    test_parse_responses_text_format_json_schema();
+    test_parse_responses_text_format_json_object();
+    test_parse_responses_text_format_llguidance_extensions();
+    test_parse_responses_text_format_rejects_unknown_type();
+    test_parse_responses_text_format_text_is_noop();
     test_tool_schema_order_from_anthropic_schema();
     test_tool_schema_order_from_openai_tools();
     test_tool_schema_order_from_responses_tool_search();