From 5845414199e2d564b7b1474ebc777b97e86e9ef8 Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Fri, 29 May 2026 18:43:53 +0100
Subject: [PATCH 1/9] Add llguidance structured outputs

---
 Makefile                           |  66 ++-
 README.md                          |  20 +-
 ds4.c                              | 239 +++++++++++
 ds4.h                              |   7 +
 ds4_llguidance.c                   | 468 ++++++++++++++++++++++
 ds4_llguidance.h                   |  37 ++
 ds4_server.c                       | 623 ++++++++++++++++++++++++++++-
 tests/structured_outputs_stress.py | 424 ++++++++++++++++++++
 8 files changed, 1866 insertions(+), 18 deletions(-)
 create mode 100644 ds4_llguidance.c
 create mode 100644 ds4_llguidance.h
 create mode 100755 tests/structured_outputs_stress.py

diff --git a/Makefile b/Makefile
index 694faf955..9cad30654 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,29 @@ OBJCFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -fo
 
 LDLIBS ?= -lm -pthread
 METAL_SRCS := $(wildcard metal/*.metal)
+LLGUIDANCE ?= 0
+LLGUIDANCE_REPO ?= https://github.com/guidance-ai/llguidance
+LLGUIDANCE_TAG ?= v1.7.5
+SERVER_EXTRA_OBJS := ds4_llguidance.o
+
+ifeq ($(LLGUIDANCE),1)
+ifeq ($(strip $(LLGUIDANCE_DIR)),)
+ifneq ($(wildcard ../../llguidance/parser/llguidance.h),)
+LLGUIDANCE_DIR := ../../llguidance
+else
+LLGUIDANCE_DIR := .deps/llguidance
+LLGUIDANCE_NEEDS_CLONE := 1
+endif
+endif
+LLGUIDANCE_LIB := $(LLGUIDANCE_DIR)/target/release/libllguidance.a
+# Link the archive only where it is also a prerequisite (server and test links).
+# In the global LDLIBS it broke the ds4/bench/eval/agent links on a fresh tree.
+SERVER_EXTRA_OBJS += $(LLGUIDANCE_LIB)
+LLGUIDANCE_LDLIBS := $(if $(filter Darwin,$(UNAME_S)),,-ldl)
+CFLAGS += -DDS4_USE_LLGUIDANCE -I$(LLGUIDANCE_DIR)/parser
+LDLIBS += $(LLGUIDANCE_LDLIBS)
+DS4_LLGUIDANCE_DEPS := $(LLGUIDANCE_LIB)
+endif
 
 ifeq ($(UNAME_S),Darwin)
 METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
@@ -31,6 +54,7 @@ CUDA_SPARK_FLAGS := -DDS4_CUDA_SPARK_HBM_CACHE=1
 CORE_OBJS = ds4.o ds4_distributed.o ds4_cuda.o
 CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
+CUDA_LDLIBS += $(LLGUIDANCE_LDLIBS)
 METAL_LDLIBS := $(LDLIBS)
 endif
 
@@ -42,6 +66,7 @@ all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
 help:
 	@echo "DS4 build targets:"
 	@echo "  make              Build Metal ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
+	@echo "  make LLGUIDANCE=1 Build with structured-output constrained decoding"
 	@echo "  make cpu          Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
 	@echo "  make test         Build and run tests"
 	@echo "  make clean        Remove build outputs"
@@ -49,8 +74,8 @@ help:
 ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
 	$(CC) $(CFLAGS) -o $@ ds4_cli.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)
 
-ds4-server: ds4_server.o ds4_kvstore.o rax.o $(CORE_OBJS)
-	$(CC) $(CFLAGS) -o $@ ds4_server.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
+ds4-server: ds4_server.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS)
+	$(CC) $(CFLAGS) -o $@ ds4_server.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(METAL_LDLIBS)
 
 ds4-bench: ds4_bench.o $(CORE_OBJS)
 	$(CC) $(CFLAGS) -o $@ ds4_bench.o $(CORE_OBJS) $(METAL_LDLIBS)
@@ -61,9 +86,9 @@ ds4-eval: ds4_eval.o $(CORE_OBJS)
 ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
 	$(CC) $(CFLAGS) -o $@ ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)
 
-cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS)
+cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
-	$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS)
+	$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
@@ -76,6 +101,7 @@ all: help
 help:
 	@echo "DS4 build targets:"
 	@echo "  make cuda-spark          Build CUDA for DGX Spark / GB10 with Spark HBM weight cache"
+	@echo "  make LLGUIDANCE=1 ...    Build with structured-output constrained decoding"
 	@echo "  make cuda-generic        Build CUDA for a generic local CUDA GPU"
 	@echo "  make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
 	@echo "  make cpu                 Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
@@ -99,7 +125,7 @@ cuda:
 ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-ds4-server: ds4_server.o ds4_kvstore.o rax.o $(CORE_OBJS)
+ds4-server: ds4_server.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
 ds4-bench: ds4_bench.o $(CORE_OBJS)
@@ -111,9 +137,9 @@ ds4-eval: ds4_eval.o $(CORE_OBJS)
 ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(CPU_CORE_OBJS)
+cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
-	$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(CPU_CORE_OBJS) $(LDLIBS)
+	$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-eval ds4_eval_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-agent ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
@@ -131,9 +157,12 @@ ds4_cli.o: ds4_cli.c ds4.h ds4_distributed.h linenoise.h
 ds4_distributed.o: ds4_distributed.c ds4_distributed.h ds4.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_distributed.c
 
-ds4_server.o: ds4_server.c ds4.h ds4_distributed.h ds4_kvstore.h rax.h
+ds4_server.o: ds4_server.c ds4.h ds4_distributed.h ds4_kvstore.h ds4_llguidance.h rax.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_server.c
 
+# With LLGUIDANCE=1 the archive prerequisite also guarantees the checkout (and its llguidance.h) exists.
+ds4_llguidance.o: ds4_llguidance.c ds4_llguidance.h ds4.h $(DS4_LLGUIDANCE_DEPS)
+	$(CC) $(CFLAGS) -c -o $@ $<
 ds4_bench.o: ds4_bench.c ds4.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_bench.c
 
@@ -149,7 +178,7 @@ ds4_web.o: ds4_web.c ds4_web.h
 ds4_kvstore.o: ds4_kvstore.c ds4_kvstore.h ds4.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_kvstore.c
 
-ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_distributed.h ds4_kvstore.h rax.h
+ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h ds4_distributed.h ds4_kvstore.h ds4_llguidance.h rax.h
 	$(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c
 
 tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
@@ -167,7 +196,7 @@ ds4_cpu.o: ds4.c ds4.h ds4_distributed.h ds4_gpu.h
 ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_distributed.h linenoise.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_cli.c
 
-ds4_server_cpu.o: ds4_server.c ds4.h ds4_distributed.h ds4_kvstore.h rax.h
+ds4_server_cpu.o: ds4_server.c ds4.h ds4_distributed.h ds4_kvstore.h ds4_llguidance.h rax.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4_server.c
 
 ds4_bench_cpu.o: ds4_bench.c ds4.h
@@ -188,11 +217,22 @@ ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
 tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-ds4_test: ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS)
+ifeq ($(LLGUIDANCE),1)
+ifeq ($(LLGUIDANCE_NEEDS_CLONE),1)
+# No sibling checkout was found: fetch the pinned tag into the private deps directory.
+$(LLGUIDANCE_DIR):
+	mkdir -p .deps && git clone --depth 1 --branch $(LLGUIDANCE_TAG) $(LLGUIDANCE_REPO) $(LLGUIDANCE_DIR)
+endif
+# Order-only prerequisite: the checkout must exist, but its mtime never retriggers cargo.
+$(LLGUIDANCE_LIB): | $(LLGUIDANCE_DIR)
+	cargo build --release --package llguidance --manifest-path $(LLGUIDANCE_DIR)/Cargo.toml
+endif
+
+ds4_test: ds4_test.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS)
 ifeq ($(UNAME_S),Darwin)
-	$(CC) $(CFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
+	$(CC) $(CFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(METAL_LDLIBS)
 else
-	$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
+	$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(CUDA_LDLIBS)
 endif
 
 test: ds4_test ds4-eval
diff --git a/README.md b/README.md
index bbc0e76da..a4aace8b5 100644
--- a/README.md
+++ b/README.md
@@ -635,9 +635,22 @@ tool calls are mapped back to OpenAI tool calls.
 
 `/v1/responses` accepts OpenAI Responses-style `input`, `instructions`,
 `tools`, `tool_choice`, `max_output_tokens`, `temperature`, `top_p`, `stream`,
-and `reasoning`. It is the preferred endpoint for Codex CLI. The server keeps
-Responses continuations bound to live state when possible, and can fall back to
-the same DSML rendering and KV prefix reuse used by chat completions.
+`text.format`, and `reasoning`. It is the preferred endpoint for Codex CLI.
+The server keeps Responses continuations bound to live state when possible, and
+can fall back to the same DSML rendering and KV prefix reuse used by chat
+completions.
+
+Structured outputs are available when the server is built with llguidance:
+
+```sh
+make LLGUIDANCE=1
+```
+
+With that build, `/v1/chat/completions` supports
+`response_format.type=json_schema` and `response_format.type=json_object`;
+`/v1/responses` supports the same modes through `text.format`. Structured
+outputs use constrained decoding, disable thinking for that turn, and currently
+cannot be combined with tools.
 
 `/v1/messages` is the Anthropic-compatible endpoint used by Claude Code style
 clients. It accepts `system`, `messages`, `tools`, `tool_choice`, `max_tokens`,
@@ -1133,6 +1146,7 @@ extractor self-test run first:
 make test                  # ./ds4-eval --self-test-extractors && ./ds4_test --all
 ./ds4_test --logprob-vectors
 ./ds4_test --server
+python3 tests/structured_outputs_stress.py --base-url http://127.0.0.1:8000/v1 --model ds4 --apis chat,responses
 ```
 
 ## Debugging Notes
diff --git a/ds4.c b/ds4.c
index 0953864ae..2913779a0 100644
--- a/ds4.c
+++ b/ds4.c
@@ -16143,6 +16143,231 @@ static int sample_top_p_min_p(
     return ids[filtered - 1];
 }
 
+/* Test bit `id` of a packed bitset (32 ids per word); a NULL mask admits every id. */
+static bool sample_mask_allows(const uint32_t *mask, size_t words, uint32_t id) {
+    if (mask == NULL) return true;
+    const size_t slot = id >> 5;
+    return slot < words && ((mask[slot] >> (id % 32u)) & UINT32_C(1)) != 0;
+}
+
+/* A token may be sampled when the allow mask admits it and no deny mask vetoes it. */
+static bool sample_filtered_allows(
+        const uint32_t *allow_mask, size_t allow_words,
+        const uint32_t *deny_mask,  size_t deny_words,
+        uint32_t        id) {
+    if (!sample_mask_allows(allow_mask, allow_words, id)) return false;
+    if (deny_mask == NULL) return true;   /* nothing to veto with */
+    return !sample_mask_allows(deny_mask, deny_words, id);
+}
+
+/*
+ * Greedy pick: index of the largest logit among the tokens that pass the
+ * allow/deny filter, or -1 when the filter admits no token at all.
+ * A NaN logit is returned only when every admitted logit is NaN; previously
+ * a NaN in the first admitted slot won because `v > NaN` is never true.
+ */
+static int sample_argmax_filtered(
+        const float    *logits,
+        uint32_t        n_vocab,
+        const uint32_t *allow_mask, size_t allow_words,
+        const uint32_t *deny_mask,  size_t deny_words) {
+    int best = -1;
+    float best_v = DS4_NEG_INF;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) continue;
+        const float v = logits[i];
+        /* Accept the first admitted token, any improvement, or any number over a NaN. */
+        if (best < 0 || v > best_v || (isnan(best_v) && !isnan(v))) { best_v = v; best = (int)i; }
+    }
+    return best;
+}
+/* Temperature-sample over every admitted token with a finite logit (no top-k cap), honoring top_p and min_p. */
+static int sample_full_vocab_filtered(
+        const float    *logits,
+        uint32_t        n_vocab,
+        float           temperature,  /* divides the logits; the visible caller only passes values > 0 */
+        float           top_p,
+        float           min_p,
+        const uint32_t *allow_mask,
+        size_t          allow_words,
+        const uint32_t *deny_mask,
+        size_t          deny_words,
+        uint64_t       *rng) {
+    float max_logit = DS4_NEG_INF;
+    int best = -1;
+    uint32_t finite = 0;
+    for (uint32_t i = 0; i < n_vocab; i++) {  /* pass 1: count finite admitted logits, track the largest */
+        if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) {
+            continue;
+        }
+        const float v = logits[i];
+        if (!isfinite(v)) continue;
+        finite++;
+        if (best < 0 || v > max_logit) {
+            max_logit = v;
+            best = (int)i;
+        }
+    }
+    if (finite == 0) return sample_argmax_filtered(logits, n_vocab, allow_mask,
+                                                   allow_words, deny_mask,
+                                                   deny_words);  /* nothing finite: filtered argmax decides */
+    /* top_p >= 1: no nucleus cut, so sample in two passes without sorting or allocating. */
+    if (top_p >= 1.0f) {
+        float sum = 0.0f;
+        const float min_rel = min_p > 0.0f ? min_p : 0.0f;  /* weights are relative to the best token (weight 1) */
+        for (uint32_t i = 0; i < n_vocab; i++) {
+            if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) {
+                continue;
+            }
+            const float v = logits[i];
+            if (!isfinite(v)) continue;
+            const float p = expf((v - max_logit) / temperature);
+            if (p < min_rel) continue;
+            sum += p;
+        }
+        if (sum <= 0.0f || !isfinite(sum)) return best;
+        float r = sample_rng_f32(rng) * sum;
+        for (uint32_t i = 0; i < n_vocab; i++) {  /* same filter as above, so the weights add up to sum */
+            if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) {
+                continue;
+            }
+            const float v = logits[i];
+            if (!isfinite(v)) continue;
+            const float p = expf((v - max_logit) / temperature);
+            if (p < min_rel) continue;
+            r -= p;
+            if (r <= 0.0f) return (int)i;
+        }
+        return best;  /* reached only if r never dropped to <= 0 */
+    }
+    /* top_p < 1: collect the candidates, sort them, then apply the min_p and nucleus cuts. */
+    sample_candidate *cand = xmalloc((size_t)finite * sizeof(cand[0]));
+    uint32_t n = 0;
+    float sum = 0.0f;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) {
+            continue;
+        }
+        const float v = logits[i];
+        if (!isfinite(v)) continue;
+        const float p = expf((v - max_logit) / temperature);
+        cand[n++] = (sample_candidate){.id = (int)i, .logit = v, .prob = p};
+        sum += p;
+    }
+    if (sum <= 0.0f || !isfinite(sum)) {
+        free(cand);
+        return best;
+    }
+    /* After the sort cand[0] is treated as the most probable token; min_p is scaled by its probability. */
+    qsort(cand, n, sizeof(cand[0]), sample_candidate_cmp_desc);
+    const float min_prob = (cand[0].prob / sum) * (min_p > 0.0f ? min_p : 0.0f);
+    float filtered_sum = 0.0f;
+    uint32_t filtered = 0;
+    for (uint32_t i = 0; i < n; i++) {
+        const float p = cand[i].prob / sum;
+        if (i > 0 && p < min_prob) break;  /* the first candidate is always kept */
+        filtered_sum += cand[i].prob;
+        filtered++;
+        if (filtered_sum / sum >= top_p) break;
+    }
+    if (filtered == 0) {
+        free(cand);
+        return best;
+    }
+    /* Draw within the kept prefix; the weights stay unnormalized, so r is scaled by filtered_sum. */
+    float r = sample_rng_f32(rng) * filtered_sum;
+    for (uint32_t i = 0; i < filtered; i++) {
+        r -= cand[i].prob;
+        if (r <= 0.0f) {
+            const int id = cand[i].id;
+            free(cand);
+            return id;
+        }
+    }
+    const int id = cand[filtered - 1].id;
+    free(cand);
+    return id;
+}
+/* Masked sampler entry point: greedy when temperature <= 0, full-vocab when top_k <= 0, else top-k (at most 1024). */
+static int sample_top_p_min_p_filtered(
+        const float    *logits,
+        uint32_t        n_vocab,
+        float           temperature,
+        int             top_k,
+        float           top_p,
+        float           min_p,
+        const uint32_t *allow_mask,
+        size_t          allow_words,
+        const uint32_t *deny_mask,
+        size_t          deny_words,
+        uint64_t       *rng) {
+    if (temperature <= 0.0f) {
+        return sample_argmax_filtered(logits, n_vocab, allow_mask, allow_words,
+                                      deny_mask, deny_words);
+    }
+    if (top_p <= 0.0f || top_p > 1.0f) top_p = 1.0f;  /* out-of-range top_p disables the nucleus cut */
+    if (min_p < 0.0f) min_p = 0.0f;
+    if (top_k <= 0) {
+        return sample_full_vocab_filtered(logits, n_vocab, temperature, top_p,
+                                          min_p, allow_mask, allow_words,
+                                          deny_mask, deny_words, rng);
+    }
+    if (top_k > 1024) top_k = 1024;  /* the stack buffers below hold at most 1024 candidates */
+    if ((uint32_t)top_k > n_vocab) top_k = (int)n_vocab;
+    /* Insertion-sort admitted finite logits into ids/vals, keeping the top_k largest in descending order. */
+    int ids[1024];
+    float vals[1024];
+    int n = 0;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        if (!sample_filtered_allows(allow_mask, allow_words, deny_mask, deny_words, i)) {
+            continue;
+        }
+        float v = logits[i];
+        if (!isfinite(v)) continue;
+        if (n == top_k && v <= vals[n - 1]) continue;  /* list is full and v does not beat its smallest entry */
+        int j = n < top_k ? n++ : n - 1;               /* append while there is room, else overwrite the tail */
+        while (j > 0 && vals[j - 1] < v) {
+            vals[j] = vals[j - 1];
+            ids[j] = ids[j - 1];
+            j--;
+        }
+        vals[j] = v;
+        ids[j] = (int)i;
+    }
+    if (n == 0) {  /* no admitted token has a finite logit */
+        return sample_argmax_filtered(logits, n_vocab, allow_mask, allow_words,
+                                      deny_mask, deny_words);
+    }
+    /* Softmax weights relative to the largest kept logit, vals[0]. */
+    float probs[1024];
+    const float max_logit = vals[0];
+    float sum = 0.0f;
+    for (int i = 0; i < n; i++) {
+        probs[i] = expf((vals[i] - max_logit) / temperature);
+        sum += probs[i];
+    }
+    if (sum <= 0.0f || !isfinite(sum)) return ids[0];
+    /* Keep the head of the list: stop at the min_p floor or once the cumulative mass reaches top_p. */
+    const float min_prob = (probs[0] / sum) * min_p;
+    float filtered_sum = 0.0f;
+    int filtered = 0;
+    for (int i = 0; i < n; i++) {
+        float p = probs[i] / sum;
+        if (i > 0 && p < min_prob) break;  /* the best candidate is always kept */
+        filtered_sum += probs[i];
+        filtered++;
+        if (filtered_sum / sum >= top_p) break;
+    }
+    if (filtered <= 0) return ids[0];
+    /* Draw proportionally to the unnormalized weights of the kept prefix. */
+    float r = sample_rng_f32(rng) * filtered_sum;
+    for (int i = 0; i < filtered; i++) {
+        r -= probs[i];
+        if (r <= 0.0f) return ids[i];
+    }
+    return ids[filtered - 1];
+}
+
 static void print_top_logits(
         FILE          * fp,
         const char    * label,
@@ -19738,6 +19963,20 @@ int ds4_session_sample(ds4_session *s, float temperature, int top_k, float top_p
     return sample_top_p_min_p(s->logits, DS4_N_VOCAB, temperature, top_k, top_p, min_p, rng);
 }
 
+/* Sample from the session's current logits restricted to allow_mask minus
+ * deny_mask (deny_mask may be NULL). Returns a token id, or -1 when the
+ * session, its logits or the allow mask is missing, or no token is admitted. */
+int ds4_session_sample_masked(ds4_session *s, float temperature, int top_k,
+                              float top_p, float min_p,
+                              const uint32_t *allow_mask, size_t allow_mask_words,
+                              const uint32_t *deny_mask, size_t deny_mask_words,
+                              uint64_t *rng) {
+    if (s == NULL || s->logits == NULL || allow_mask == NULL) return -1;
+    return sample_top_p_min_p_filtered(
+            s->logits, DS4_N_VOCAB, temperature, top_k, top_p, min_p,
+            allow_mask, allow_mask_words, deny_mask, deny_mask_words, rng);
+}
+
 int ds4_session_top_logprobs(ds4_session *s, ds4_token_score *out, int k) {
     if (!s || !out || k <= 0) return 0;
     if (k > (int)DS4_N_VOCAB) k = (int)DS4_N_VOCAB;
diff --git a/ds4.h b/ds4.h
index 7b7233c36..4fed595ae 100644
--- a/ds4.h
+++ b/ds4.h
@@ -236,6 +236,13 @@ int ds4_session_argmax_excluding(ds4_session *s, int excluded_id);
 int ds4_sample_logits(const float *logits, int n_vocab, float temperature,
                       int top_k, float top_p, float min_p, uint64_t *rng);
 int ds4_session_sample(ds4_session *s, float temperature, int top_k, float top_p, float min_p, uint64_t *rng);
+int ds4_session_sample_masked(ds4_session *s, float temperature, int top_k,
+                              float top_p, float min_p,
+                              const uint32_t *allow_mask,  /* required bitset, 32 token ids per word */
+                              size_t allow_mask_words,     /* uint32_t words in allow_mask */
+                              const uint32_t *deny_mask,   /* optional veto bitset, may be NULL */
+                              size_t deny_mask_words,      /* uint32_t words in deny_mask */
+                              uint64_t *rng);              /* RNG state handed to the sampler */
 int ds4_session_top_logprobs(ds4_session *s, ds4_token_score *out, int k);
 int ds4_session_token_logprob(ds4_session *s, int token, ds4_token_score *out);
 int ds4_session_copy_logits(ds4_session *s, float *out, int cap);
diff --git a/ds4_llguidance.c b/ds4_llguidance.c
new file mode 100644
index 000000000..1a39ea3e8
--- /dev/null
+++ b/ds4_llguidance.c
@@ -0,0 +1,468 @@
+#include "ds4_llguidance.h"
+
+#include <ctype.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef DS4_USE_LLGUIDANCE
+#include <pthread.h>
+#include "llguidance.h"
+#endif
+
+#ifndef UINT32_C
+#include <stdint.h>
+#endif
+/* State of one constrained decoder; it only carries real fields when built with DS4_USE_LLGUIDANCE. */
+struct ds4_llguidance {
+#ifdef DS4_USE_LLGUIDANCE
+    LlgTokenizer *tokenizer;          /* tokenizer clone, released in ds4_llguidance_free */
+    LlgMatcher *matcher;              /* grammar matcher, released in ds4_llguidance_free */
+    const uint32_t *leading_ws_mask;  /* borrowed from the tokenizer cache; not freed with this struct */
+    size_t leading_ws_words;          /* uint32_t words in leading_ws_mask */
+    size_t mask_words;                /* uint32_t words in the matcher's token mask */
+    int n_vocab;
+    int eos_token;                    /* returned by the sampler once the matcher reports it has stopped */
+    bool started;                     /* false at creation; while false the sampler may veto whitespace-only tokens */
+#else
+    int unused;                       /* placeholder member for builds without llguidance */
+#endif
+};
+/* Report whether this binary was compiled with DS4_USE_LLGUIDANCE defined. */
+bool ds4_llguidance_available(void) {
+#ifndef DS4_USE_LLGUIDANCE
+    return false;
+#else
+    return true;
+#endif
+}
+/* Human-readable counterpart of ds4_llguidance_available(); returns a static string. */
+const char *ds4_llguidance_build_info(void) {
+#ifndef DS4_USE_LLGUIDANCE
+    return "llguidance disabled";
+#else
+    return "llguidance enabled";
+#endif
+}
+
+#ifdef DS4_USE_LLGUIDANCE
+/* Process-wide tokenizer cache; it holds the tokenizer built for one engine at a time. */
+typedef struct {
+    ds4_engine *engine;         /* engine the cached tokenizer was built for (compared by pointer) */
+    LlgTokenizer *tokenizer;    /* master copy; decoders receive clones of it */
+    uint32_t *leading_ws_mask;  /* bitset of tokens whose bytes are all JSON whitespace */
+    size_t leading_ws_words;    /* uint32_t words in leading_ws_mask */
+    int n_vocab;
+} ds4_llg_cache;
+
+static pthread_mutex_t g_llg_cache_mu = PTHREAD_MUTEX_INITIALIZER;  /* held while g_llg_cache is read or rebuilt */
+static ds4_llg_cache g_llg_cache = {0};
+/* printf-style write into the caller's error buffer; a no-op when there is no buffer. */
+static void set_err(char *err, size_t errlen, const char *fmt, ...) {
+    if (err == NULL || errlen == 0) return;
+    va_list args;
+    va_start(args, fmt);
+    (void)vsnprintf(err, errlen, fmt, args);
+    va_end(args);
+}
+/* True for the four bytes treated as JSON whitespace: space, LF, CR and TAB. */
+static bool json_ws_byte(unsigned char c) {
+    return c != '\0' && strchr(" \n\r\t", c) != NULL;
+}
+/* True when the span is non-empty and every byte in it is JSON whitespace;
+ * a NULL or empty span is not considered whitespace. */
+static bool bytes_all_json_ws(const char *p, size_t len) {
+    if (p == NULL || len == 0) return false;
+    size_t ws = 0;
+    while (ws < len && json_ws_byte((unsigned char)p[ws])) ws++;
+    return ws == len;   /* stopping early means a non-whitespace byte was found */
+}
+/* True when at least one byte of the span is not JSON whitespace; a NULL or
+ * empty span has no such byte. */
+static bool bytes_have_non_json_ws(const char *p, size_t len) {
+    if (p == NULL) return false;
+    size_t ws = 0;
+    while (ws < len && json_ws_byte((unsigned char)p[ws])) ws++;
+    return ws < len;
+}
+/* Classify a token's text as special: an exact match with one of the listed
+ * markers, or any text containing U+FF5C (fullwidth vertical bar). */
+static bool token_text_is_special(const char *p, size_t len) {
+    static const char *const exact[] = {
+        "<｜begin▁of▁sentence｜>",
+        "<｜end▁of▁sentence｜>",
+        "<｜User｜>",
+        "<｜Assistant｜>",
+        "<think>",
+        "</think>",
+        "｜DSML｜",
+    };
+    static const unsigned char fw_bar[3] = {0xef, 0xbd, 0x9c};  /* UTF-8 encoding of U+FF5C */
+    const size_t n_exact = sizeof(exact) / sizeof(exact[0]);
+    for (size_t k = 0; k < n_exact; k++) {
+        if (strlen(exact[k]) == len && memcmp(p, exact[k], len) == 0) return true;
+    }
+    for (size_t at = 0; at + sizeof(fw_bar) <= len; at++) {
+        if (memcmp(p + at, fw_bar, sizeof(fw_bar)) == 0) return true;
+    }
+    return false;
+}
+/* Set bit `token` in a packed bitset; the caller must size the mask to cover it. */
+static void bitset_set(uint32_t *mask, int token) {
+    mask[(uint32_t)token >> 5] |= UINT32_C(1) << ((uint32_t)token % 32u);
+}
+/* Read bit `token`; a NULL mask or a token beyond `words` reads as unset. */
+static bool bitset_get(const uint32_t *mask, size_t words, uint32_t token) {
+    const size_t slot = token >> 5;
+    if (mask == NULL || slot >= words) return false;
+    return ((mask[slot] >> (token % 32u)) & UINT32_C(1)) != 0;
+}
+/* True when at least one token id below n_vocab is set in `allow` and not
+ * set in `deny`. A NULL deny mask denies nothing; a NULL allow mask admits
+ * nothing. */
+static bool mask_has_non_denied_token(const uint32_t *allow,
+                                      size_t allow_words,
+                                      const uint32_t *deny,
+                                      size_t deny_words,
+                                      int n_vocab) {
+    if (allow == NULL) return false;
+    for (int id = 0; id < n_vocab; id++) {
+        if (!bitset_get(allow, allow_words, (uint32_t)id)) continue;
+        if (bitset_get(deny, deny_words, (uint32_t)id)) continue;
+        return true;
+    }
+    return false;
+}
+/* llguidance tokenizer callback: tokenizes `bytes` with the ds4 engine passed
+ * as user_data, stores up to output_tokens_len ids and returns the full token
+ * count (0 if the scratch copy cannot be allocated). */
+static size_t ds4_llg_tokenize_fn(const void *user_data,
+                                  const uint8_t *bytes,
+                                  size_t bytes_len,
+                                  uint32_t *output_tokens,
+                                  size_t output_tokens_len) {
+    /* ds4_tokenize_text is handed a NUL-terminated private copy of the input. */
+    char *copy = malloc(bytes_len + 1);
+    if (copy == NULL) return 0;
+    memcpy(copy, bytes, bytes_len);
+    copy[bytes_len] = '\0';
+    ds4_tokens ids = {0};
+    ds4_tokenize_text((ds4_engine *)user_data, copy, &ids);
+    free(copy);
+
+    const size_t total = ids.len < 0 ? 0 : (size_t)ids.len;
+    for (size_t k = 0; k < total && k < output_tokens_len; k++) output_tokens[k] = (uint32_t)ids.v[k];
+    ds4_tokens_free(&ids);
+    return total;
+}
+/* Build an LlgTokenizer from the engine vocabulary, plus a bitset of tokens made only of JSON whitespace. */
+static LlgTokenizer *build_tokenizer(ds4_engine *e,
+                                     uint32_t **leading_ws_mask_out,
+                                     size_t *leading_ws_words_out,
+                                     int *n_vocab_out,
+                                     char *err,
+                                     size_t errlen) {
+    const int n_vocab = ds4_engine_vocab_size(e);
+    if (n_vocab <= 0) {
+        set_err(err, errlen, "llguidance tokenizer cannot use an empty vocabulary");
+        return NULL;
+    }
+    /* token_lens[i] is the byte length handed to llguidance for token i; total is their sum. */
+    size_t total = 0;
+    uint32_t *token_lens = calloc((size_t)n_vocab, sizeof(token_lens[0]));
+    if (!token_lens) {
+        set_err(err, errlen, "out of memory");
+        return NULL;
+    }
+    /* One bit per token id, rounded up to whole 32-bit words. */
+    const size_t mask_words = ((size_t)n_vocab + 31u) / 32u;
+    uint32_t *leading_ws = calloc(mask_words, sizeof(leading_ws[0]));
+    if (!leading_ws) {
+        free(token_lens);
+        set_err(err, errlen, "out of memory");
+        return NULL;
+    }
+    /* Pass 1: measure every token; special tokens are given one extra byte for a marker prefix. */
+    for (int i = 0; i < n_vocab; i++) {
+        size_t len = 0;
+        char *piece = ds4_token_text(e, i, &len);  /* released with free() below */
+        const bool special = token_text_is_special(piece, len);
+        token_lens[i] = (uint32_t)(len + (special ? 1u : 0u));
+        total += token_lens[i];
+        if (!special && bytes_all_json_ws(piece, len)) bitset_set(leading_ws, i);
+        free(piece);
+    }
+    /* malloc(0) may legitimately return NULL, so at least one byte is requested. */
+    uint8_t *token_bytes = malloc(total ? total : 1);
+    if (!token_bytes) {
+        free(leading_ws);
+        free(token_lens);
+        set_err(err, errlen, "out of memory");
+        return NULL;
+    }
+    /* Pass 2: concatenate the token bytes, writing 0xff ahead of each special token's text. */
+    size_t off = 0;
+    for (int i = 0; i < n_vocab; i++) {
+        size_t len = 0;
+        char *piece = ds4_token_text(e, i, &len);
+        if (token_text_is_special(piece, len)) token_bytes[off++] = 0xffu;
+        memcpy(token_bytes + off, piece, len);
+        off += len;
+        free(piece);
+    }
+    /* Describe the vocabulary to llguidance; the engine is passed as the tokenize callback's user data. */
+    LlgTokenizerInit init = {0};
+    init.vocab_size = (uint32_t)n_vocab;
+    init.tok_eos = (uint32_t)ds4_token_eos(e);
+    init.token_lens = token_lens;
+    init.token_bytes = token_bytes;
+    init.tokenize_assumes_string = true;
+    init.tokenize_fn = ds4_llg_tokenize_fn;
+    init.use_approximate_greedy_tokenize_fn = false;
+    init.tokenize_user_data = e;
+    init.slices = NULL;
+    /* Both buffers are freed right after the call, whether it succeeds or not (presumably llguidance copies them -- confirm). */
+    char llg_err[1024] = {0};
+    LlgTokenizer *tok = llg_new_tokenizer(&init, llg_err, sizeof(llg_err));
+    free(token_bytes);
+    free(token_lens);
+    if (!tok) {
+        free(leading_ws);
+        set_err(err, errlen, "llguidance tokenizer error: %s", llg_err);
+        return NULL;
+    }
+    /* Success: ownership of the whitespace mask passes to the caller through the out-parameters. */
+    *leading_ws_mask_out = leading_ws;
+    *leading_ws_words_out = mask_words;
+    *n_vocab_out = n_vocab;
+    return tok;
+}
+/* Return a private clone of the process-wide tokenizer for engine `e`,
+ * (re)building the cache under g_llg_cache_mu when it is empty or was built
+ * for a different engine. The mask pointer handed back is owned by the cache.
+ * NOTE(review): a rebuild frees the previous mask, so decoders created for an
+ * earlier engine would be left with a dangling leading_ws_mask -- confirm that
+ * only one engine is ever used per process. */
+static LlgTokenizer *cached_tokenizer_clone(ds4_engine *e,
+                                            const uint32_t **leading_ws_mask_out,
+                                            size_t *leading_ws_words_out,
+                                            int *n_vocab_out,
+                                            char *err,
+                                            size_t errlen) {
+    ds4_llg_cache *const c = &g_llg_cache;
+    pthread_mutex_lock(&g_llg_cache_mu);
+    const bool stale = c->tokenizer == NULL || c->engine != e;
+    if (stale) {
+        if (c->tokenizer != NULL) llg_free_tokenizer(c->tokenizer);
+        free(c->leading_ws_mask);
+        memset(c, 0, sizeof(*c));
+        ds4_llg_cache fresh = {0};
+        fresh.tokenizer = build_tokenizer(e, &fresh.leading_ws_mask,
+                                          &fresh.leading_ws_words,
+                                          &fresh.n_vocab, err, errlen);
+        if (fresh.tokenizer == NULL) {
+            pthread_mutex_unlock(&g_llg_cache_mu);
+            return NULL;
+        }
+        fresh.engine = e;
+        *c = fresh;
+    }
+    LlgTokenizer *const clone = llg_clone_tokenizer(c->tokenizer);
+    if (leading_ws_mask_out != NULL) *leading_ws_mask_out = c->leading_ws_mask;
+    if (leading_ws_words_out != NULL) *leading_ws_words_out = c->leading_ws_words;
+    if (n_vocab_out != NULL) *n_vocab_out = c->n_vocab;
+    pthread_mutex_unlock(&g_llg_cache_mu);
+    if (clone == NULL) set_err(err, errlen, "llguidance tokenizer clone failed");
+    return clone;
+}
+/* Create a constrained decoder for engine `e` from a constraint type and its data; returns NULL and fills err on failure. */
+ds4_llguidance *ds4_llguidance_create(ds4_engine *e,
+                                      const char *constraint_type,
+                                      const char *constraint_data,
+                                      char *err,
+                                      size_t errlen) {
+    if (!e || !constraint_type || !constraint_type[0]) {
+        set_err(err, errlen, "invalid structured output constraint");
+        return NULL;
+    }
+    /* Each decoder gets its own tokenizer clone; the whitespace mask stays owned by the shared cache. */
+    const uint32_t *leading_ws_mask = NULL;
+    size_t leading_ws_words = 0;
+    int n_vocab = 0;
+    LlgTokenizer *tok = cached_tokenizer_clone(e, &leading_ws_mask,
+                                               &leading_ws_words,
+                                               &n_vocab, err, errlen);
+    if (!tok) return NULL;
+    /* Start from llguidance defaults; LLGUIDANCE_LOG_LEVEL, else DS4_LLGUIDANCE_LOG_LEVEL, sets the stderr log level. */
+    LlgConstraintInit init;
+    llg_constraint_init_set_defaults(&init, tok);
+    const char *log_level = getenv("LLGUIDANCE_LOG_LEVEL");
+    if (!log_level || !log_level[0]) log_level = getenv("DS4_LLGUIDANCE_LOG_LEVEL");
+    if (log_level && log_level[0]) init.log_stderr_level = (uint32_t)atoi(log_level);
+    /* A NULL constraint_data is passed as ""; matcher problems are read back through llg_matcher_get_error. */
+    LlgMatcher *matcher = llg_new_matcher(&init, constraint_type,
+                                          constraint_data ? constraint_data : "");
+    const char *llg_err = matcher ? llg_matcher_get_error(matcher) : "allocation failed";
+    if (llg_err) {
+        set_err(err, errlen, "llguidance grammar error: %s", llg_err);  /* copied out before the matcher is freed */
+        if (matcher) llg_free_matcher(matcher);
+        llg_free_tokenizer(tok);
+        return NULL;
+    }
+    /* Sanity check: the matcher's mask must be exactly n_vocab bits rounded up to whole 32-bit words. */
+    const size_t mask_bytes = llg_matcher_get_mask_byte_size(matcher);
+    const size_t expected = ((size_t)n_vocab + 31u) / 32u * sizeof(uint32_t);
+    if (mask_bytes != expected) {
+        set_err(err, errlen, "llguidance mask size mismatch");
+        llg_free_matcher(matcher);
+        llg_free_tokenizer(tok);
+        return NULL;
+    }
+    /* Everything validated: allocate the handle, which takes ownership of tok and matcher. */
+    ds4_llguidance *g = calloc(1, sizeof(*g));
+    if (!g) {
+        set_err(err, errlen, "out of memory");
+        llg_free_matcher(matcher);
+        llg_free_tokenizer(tok);
+        return NULL;
+    }
+    g->tokenizer = tok;
+    g->matcher = matcher;
+    g->leading_ws_mask = leading_ws_mask;
+    g->leading_ws_words = leading_ws_words;
+    g->mask_words = mask_bytes / sizeof(uint32_t);
+    g->n_vocab = n_vocab;
+    g->eos_token = ds4_token_eos(e);
+    g->started = false;
+    return g;
+}
+
+void ds4_llguidance_free(ds4_llguidance *g) {
+    if (!g) return; /* NULL is accepted and ignored */
+    if (g->matcher) llg_free_matcher(g->matcher);
+    if (g->tokenizer) llg_free_tokenizer(g->tokenizer);
+    free(g); /* matcher and tokenizer were released above */
+}
+
+/* Sample the next token under the grammar constraint held by g.
+ *
+ * Once the matcher reports it has stopped, g->eos_token is returned without
+ * sampling. Otherwise the matcher's allow mask is handed to
+ * ds4_session_sample_masked(). While g->started is still false the
+ * leading-whitespace mask is additionally passed as a deny list, provided
+ * mask_has_non_denied_token() returns true for that combination.
+ * On failure err is filled in and a negative value is returned. */
+int ds4_llguidance_sample(ds4_llguidance *g, ds4_session *s,
+                          float temperature, int top_k,
+                          float top_p, float min_p,
+                          uint64_t *rng,
+                          char *err, size_t errlen) {
+    if (g == NULL || g->matcher == NULL || s == NULL) {
+        set_err(err, errlen, "structured output decoder is not active");
+        return -1;
+    }
+    if (llg_matcher_is_stopped(g->matcher)) return g->eos_token;
+    if (llg_matcher_compute_mask(g->matcher) != 0) {
+        set_err(err, errlen, "llguidance mask error: %s",
+                llg_matcher_get_error(g->matcher));
+        return -1;
+    }
+    const uint32_t *allow = llg_matcher_get_mask(g->matcher);
+    if (allow == NULL) {
+        set_err(err, errlen, "llguidance did not return a token mask");
+        return -1;
+    }
+
+    const bool deny_leading_ws = !g->started &&
+        mask_has_non_denied_token(allow, g->mask_words, g->leading_ws_mask,
+                                  g->leading_ws_words, g->n_vocab);
+    const uint32_t *deny = deny_leading_ws ? g->leading_ws_mask : NULL;
+    const size_t deny_words = deny_leading_ws ? g->leading_ws_words : 0;
+
+    const int token = ds4_session_sample_masked(s, temperature, top_k, top_p,
+                                                min_p, allow, g->mask_words,
+                                                deny, deny_words, rng);
+    if (token < 0) set_err(err, errlen, "llguidance mask allowed no sampleable token");
+    return token;
+}
+
+/* Feed a sampled token to the matcher. A NULL/inactive decoder or a negative
+ * token is accepted without doing anything. Returns false with err set when
+ * llguidance rejects the token. The first accepted token for which
+ * bytes_have_non_json_ws() is true flips g->started (needs a non-NULL e). */
+bool ds4_llguidance_accept(ds4_llguidance *g, ds4_engine *e, int token,
+                           char *err, size_t errlen) {
+    if (g == NULL || g->matcher == NULL || token < 0) return true;
+    if (llg_matcher_consume_token(g->matcher, (uint32_t)token) != 0) {
+        set_err(err, errlen, "llguidance consume error: %s",
+                llg_matcher_get_error(g->matcher));
+        return false;
+    }
+    if (g->started || e == NULL) return true;
+
+    size_t len = 0;
+    char *piece = ds4_token_text(e, token, &len);
+    if (bytes_have_non_json_ws(piece, len)) g->started = true;
+    free(piece);
+    return true;
+}
+
+#else
+
+/* #else fallback: never creates a decoder; err asks for an LLGUIDANCE=1 build. */
+ds4_llguidance *ds4_llguidance_create(ds4_engine *e,
+                                      const char *constraint_type,
+                                      const char *constraint_data,
+                                      char *err, size_t errlen) {
+    (void)e;
+    (void)constraint_type;
+    (void)constraint_data;
+    if (err != NULL && errlen != 0) {
+        snprintf(err, errlen,
+                 "structured outputs require building ds4 with LLGUIDANCE=1");
+    }
+    return NULL;
+}
+
+void ds4_llguidance_free(ds4_llguidance *g) {
+    (void)g; /* nothing to release: this build's create() always returns NULL */
+}
+
+/* Fallback definition from the #else branch of this file: it never samples.
+ * When err is usable it receives the message that structured outputs need
+ * a build with LLGUIDANCE=1.
+ * Always returns -1. */
+int ds4_llguidance_sample(ds4_llguidance *g, ds4_session *s,
+                          float temperature, int top_k,
+                          float top_p, float min_p,
+                          uint64_t *rng,
+                          char *err, size_t errlen) {
+    (void)g;
+    (void)s;
+    (void)temperature;
+    (void)top_k;
+    (void)top_p;
+    (void)min_p;
+    (void)rng;
+    if (err != NULL && errlen != 0) {
+        snprintf(err, errlen,
+                 "structured outputs require building ds4 with LLGUIDANCE=1");
+    }
+    return -1;
+}
+
+/* Fallback definition from the #else branch of this file: accepts every
+ * token and never touches err. */
+bool ds4_llguidance_accept(ds4_llguidance *g,
+                           ds4_engine *e, int token,
+                           char *err, size_t errlen) {
+    (void)g;
+    (void)e;
+    (void)token;
+    (void)err;
+    (void)errlen;
+    return true;
+}
+
+#endif
diff --git a/ds4_llguidance.h b/ds4_llguidance.h
new file mode 100644
index 000000000..f677f3b13
--- /dev/null
+++ b/ds4_llguidance.h
@@ -0,0 +1,37 @@
+#ifndef DS4_LLGUIDANCE_H
+#define DS4_LLGUIDANCE_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "ds4.h"
+
+/* Opaque decoder state; only ever handled through a pointer here. */
+typedef struct ds4_llguidance ds4_llguidance;
+
+/* Capability query and build description (presumably tied to LLGUIDANCE=1). */
+bool ds4_llguidance_available(void);
+const char *ds4_llguidance_build_info(void);
+
+/* Create a decoder for the given constraint. Returns NULL with a message in
+ * err on failure; release the result with ds4_llguidance_free(). */
+ds4_llguidance *ds4_llguidance_create(ds4_engine *e,
+                                      const char *constraint_type,
+                                      const char *constraint_data,
+                                      char *err, size_t errlen);
+void ds4_llguidance_free(ds4_llguidance *g);
+
+/* Sample the next token under the constraint. A negative return value
+ * signals failure, with a message in err. */
+int ds4_llguidance_sample(ds4_llguidance *g, ds4_session *s,
+                          float temperature, int top_k,
+                          float top_p, float min_p,
+                          uint64_t *rng,
+                          char *err, size_t errlen);
+/* Tell the decoder which token was emitted. Returns false, with a message
+ * in err, when the token is rejected. */
+bool ds4_llguidance_accept(ds4_llguidance *g, ds4_engine *e, int token,
+                           char *err, size_t errlen);
+
+#endif
diff --git a/ds4_server.c b/ds4_server.c
index 2cd33b18a..add158ca4 100644
--- a/ds4_server.c
+++ b/ds4_server.c
@@ -1,6 +1,7 @@
 #include "ds4.h"
 #include "ds4_distributed.h"
 #include "ds4_kvstore.h"
+#include "ds4_llguidance.h"
 #include "rax.h"
 
 /* OpenAI/Anthropic compatible local server.
@@ -402,6 +403,392 @@ static char *json_minify_raw_value(const char *json) {
     return buf_take(&b);
 }
 
+typedef enum {
+    DS4_TEXT_FORMAT_TEXT,        /* unconstrained output; also the zeroed default */
+    DS4_TEXT_FORMAT_JSON_OBJECT, /* request type "json_object" without a schema */
+    DS4_TEXT_FORMAT_JSON_SCHEMA, /* output constrained by schema_json */
+} ds4_text_format_type;
+
+typedef struct {
+    ds4_text_format_type type;
+    char *name;        /* owned; freed by ds4_text_format_clear() */
+    char *schema_json; /* owned raw JSON text of the schema, or NULL */
+    bool strict;       /* "strict" flag as parsed from the request */
+} ds4_text_format;
+
+static void ds4_text_format_clear(ds4_text_format *f) {
+    if (!f) return;
+    free(f->name);
+    free(f->schema_json);
+    memset(f, 0, sizeof(*f)); /* back to DS4_TEXT_FORMAT_TEXT with NULL strings */
+}
+
+static bool ds4_text_format_is_json(const ds4_text_format *f) {
+    return f && (f->type == DS4_TEXT_FORMAT_JSON_OBJECT || /* NULL f => false */
+                 f->type == DS4_TEXT_FORMAT_JSON_SCHEMA);
+}
+
+/* Replace *f with the given format. Whatever f held before is freed first;
+ * f then takes ownership of name and schema_json (either may be NULL). */
+static void ds4_text_format_set_schema(ds4_text_format *f,
+                                       ds4_text_format_type type, char *name,
+                                       char *schema_json, bool strict) {
+    ds4_text_format_clear(f);
+    f->strict = strict;
+    f->schema_json = schema_json;
+    f->name = name;
+    f->type = type;
+}
+
+/* Constraint type string for ds4_llguidance_create(); "text" when f is NULL. */
+static const char *ds4_text_format_constraint_type(const ds4_text_format *f) {
+    if (f == NULL) return "text";
+    const bool has_schema = f->type == DS4_TEXT_FORMAT_JSON_SCHEMA ||
+        (f->type == DS4_TEXT_FORMAT_JSON_OBJECT && f->schema_json != NULL);
+    if (has_schema) return "json_schema";
+    return f->type == DS4_TEXT_FORMAT_JSON_OBJECT ? "json_object" : "text";
+}
+
+static const char *ds4_text_format_constraint_data(const ds4_text_format *f) {
+    return f && f->schema_json ? f->schema_json : ""; /* never NULL */
+}
+
+/* Check that the requested JSON format can actually be compiled by
+ * llguidance by building a throwaway decoder. Non-JSON formats always pass.
+ * On failure, false is returned and err explains why. */
+static bool ds4_text_format_validate_with_llguidance(ds4_engine *e,
+                                                     const ds4_text_format *f,
+                                                     char *err, size_t errlen) {
+    if (!ds4_text_format_is_json(f)) return true;
+    if (!ds4_llguidance_available()) {
+        snprintf(err, errlen,
+                 "structured outputs require building ds4 with LLGUIDANCE=1");
+        return false;
+    }
+
+    char detail[160] = {0};
+    const char *ctype = ds4_text_format_constraint_type(f);
+    const char *cdata = ds4_text_format_constraint_data(f);
+    ds4_llguidance *probe = ds4_llguidance_create(e, ctype, cdata,
+                                                  detail, sizeof(detail));
+    if (probe != NULL) {
+        ds4_llguidance_free(probe);
+        return true;
+    }
+    snprintf(err, errlen, "invalid structured output schema: %s",
+             detail[0] ? detail : "llguidance rejected constraint");
+    return false;
+}
+
+/* Parse the Chat Completions "json_schema" wrapper object
+ * {"name":..., "schema":..., "strict":...} into *format; other keys are
+ * skipped. Returns false on malformed JSON (err is left untouched so the
+ * caller can supply the message) or when "schema" is missing (err is set). */
+static bool parse_json_schema_wrapper(const char **p,
+                                      ds4_text_format *format,
+                                      char *err,
+                                      size_t errlen) {
+    json_ws(p);
+    if (**p != '{') return false;
+    (*p)++;
+    char *name = NULL;
+    char *schema = NULL;
+    char *key = NULL;
+    bool strict = false;
+    json_ws(p);
+    while (**p && **p != '}') {
+        if (!json_string(p, &key)) goto bad;
+        json_ws(p);
+        if (**p != ':') goto bad;
+        (*p)++;
+        if (!strcmp(key, "name")) {
+            /* Clear the pointer once freed: if a repeated key then fails to
+             * parse, the cleanup at "bad" must not free the old value again. */
+            free(name);
+            name = NULL;
+            if (!json_string(p, &name)) goto bad;
+        } else if (!strcmp(key, "schema")) {
+            free(schema);
+            schema = NULL;
+            if (!json_raw_value(p, &schema)) goto bad;
+        } else if (!strcmp(key, "strict")) {
+            if (!json_bool(p, &strict)) goto bad;
+        } else if (!json_skip_value(p)) {
+            goto bad;
+        }
+        free(key);
+        key = NULL;
+        json_ws(p);
+        /* Lenient separators: a ',' is consumed when present but is not
+         * required between members. */
+        if (**p == ',') (*p)++;
+        json_ws(p);
+    }
+    if (**p != '}') goto bad;
+    (*p)++;
+    if (!schema) {
+        snprintf(err, errlen, "json_schema.schema is required");
+        free(name);
+        return false;
+    }
+    /* Ownership of name and schema moves into *format. */
+    ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                               name, schema, strict);
+    return true;
+bad:
+    free(key);
+    free(name);
+    free(schema);
+    return false;
+}
+
+/* Parse a Chat Completions "response_format" value into *format.
+ * null and type "text" (or no type) clear *format; "json_object" selects
+ * plain JSON, or schema-constrained JSON when a top-level "schema" is given;
+ * "json_schema" takes the schema from the "json_schema" wrapper, falling back
+ * to a top-level "schema" when no wrapper was seen.
+ * Returns false on error. err is set except when the value is not an object
+ * or the wrapper itself is malformed; the request parser then fills it in. */
+static bool parse_chat_response_format(const char **p,
+                                       ds4_text_format *format,
+                                       char *err,
+                                       size_t errlen) {
+    json_ws(p);
+    if (json_lit(p, "null")) {
+        ds4_text_format_clear(format);
+        return true;
+    }
+    if (**p != '{') return false;
+    (*p)++;
+
+    char *type = NULL;
+    char *schema = NULL;
+    char *name = NULL;
+    char *key = NULL;
+    bool strict = false;
+    bool saw_json_schema = false;
+    json_ws(p);
+    while (**p && **p != '}') {
+        if (!json_string(p, &key)) goto bad;
+        json_ws(p);
+        if (**p != ':') goto bad;
+        (*p)++;
+        if (!strcmp(key, "type")) {
+            /* Clear the pointer once freed: if a repeated key then fails to
+             * parse, the cleanup at "bad" must not free the old value again. */
+            free(type);
+            type = NULL;
+            if (!json_string(p, &type)) goto bad;
+        } else if (!strcmp(key, "json_schema")) {
+            /* The wrapper stores its result directly in *format. */
+            saw_json_schema = true;
+            if (!parse_json_schema_wrapper(p, format, err, errlen)) {
+                goto bad_keep_err;
+            }
+        } else if (!strcmp(key, "schema")) {
+            free(schema);
+            schema = NULL;
+            if (!json_raw_value(p, &schema)) goto bad;
+        } else if (!strcmp(key, "name")) {
+            free(name);
+            name = NULL;
+            if (!json_string(p, &name)) goto bad;
+        } else if (!strcmp(key, "strict")) {
+            if (!json_bool(p, &strict)) goto bad;
+        } else if (!json_skip_value(p)) {
+            goto bad;
+        }
+        free(key);
+        key = NULL;
+        json_ws(p);
+        if (**p == ',') (*p)++;
+        json_ws(p);
+    }
+    if (**p != '}') goto bad;
+    (*p)++;
+
+    if (!type || !strcmp(type, "text")) {
+        ds4_text_format_clear(format);
+    } else if (!strcmp(type, "json_object")) {
+        if (schema) {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                                       name, schema, strict);
+            name = NULL;
+            schema = NULL;
+        } else {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_OBJECT,
+                                       NULL, NULL, false);
+        }
+    } else if (!strcmp(type, "json_schema")) {
+        /* With a wrapper, *format was already filled while parsing it. */
+        if (!saw_json_schema && schema) {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                                       name, schema, strict);
+            name = NULL;
+            schema = NULL;
+        } else if (!format->schema_json) {
+            snprintf(err, errlen, "response_format json_schema.schema is required");
+            goto bad_keep_err;
+        }
+    } else {
+        snprintf(err, errlen, "response_format.type=%s not supported", type);
+        goto bad_keep_err;
+    }
+
+    /* Anything not handed over to *format is still owned here. */
+    free(type);
+    free(name);
+    free(schema);
+    return true;
+bad:
+    snprintf(err, errlen, "invalid response_format");
+bad_keep_err:
+    free(key);
+    free(type);
+    free(name);
+    free(schema);
+    return false;
+}
+
+/* Parse the Responses API "text.format" object into *format.
+ * null and type "text" (or no type) clear *format; "json_object" selects
+ * plain JSON, or schema-constrained JSON when "schema" is given;
+ * "json_schema" requires "schema". "name" and "strict" are recorded
+ * alongside a schema.
+ * Returns false on error; err is set unless the value is not an object, in
+ * which case it is left untouched. */
+static bool parse_responses_text_format_object(const char **p,
+                                               ds4_text_format *format,
+                                               char *err,
+                                               size_t errlen) {
+    json_ws(p);
+    if (json_lit(p, "null")) {
+        ds4_text_format_clear(format);
+        return true;
+    }
+    if (**p != '{') return false;
+    (*p)++;
+    char *type = NULL;
+    char *name = NULL;
+    char *schema = NULL;
+    char *key = NULL;
+    bool strict = false;
+    json_ws(p);
+    while (**p && **p != '}') {
+        if (!json_string(p, &key)) goto bad;
+        json_ws(p);
+        if (**p != ':') goto bad;
+        (*p)++;
+        if (!strcmp(key, "type")) {
+            /* Clear the pointer once freed: if a repeated key then fails to
+             * parse, the cleanup at "bad" must not free the old value again. */
+            free(type);
+            type = NULL;
+            if (!json_string(p, &type)) goto bad;
+        } else if (!strcmp(key, "name")) {
+            free(name);
+            name = NULL;
+            if (!json_string(p, &name)) goto bad;
+        } else if (!strcmp(key, "schema")) {
+            free(schema);
+            schema = NULL;
+            if (!json_raw_value(p, &schema)) goto bad;
+        } else if (!strcmp(key, "strict")) {
+            if (!json_bool(p, &strict)) goto bad;
+        } else if (!json_skip_value(p)) {
+            goto bad;
+        }
+        free(key);
+        key = NULL;
+        json_ws(p);
+        if (**p == ',') (*p)++;
+        json_ws(p);
+    }
+    if (**p != '}') goto bad;
+    (*p)++;
+
+    if (!type || !strcmp(type, "text")) {
+        ds4_text_format_clear(format);
+    } else if (!strcmp(type, "json_object")) {
+        if (schema) {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                                       name, schema, strict);
+            name = NULL;
+            schema = NULL;
+        } else {
+            ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_OBJECT,
+                                       NULL, NULL, false);
+        }
+    } else if (!strcmp(type, "json_schema")) {
+        if (!schema) {
+            snprintf(err, errlen, "text.format.schema is required");
+            goto bad_keep_err;
+        }
+        /* *format takes ownership of name and schema. */
+        ds4_text_format_set_schema(format, DS4_TEXT_FORMAT_JSON_SCHEMA,
+                                   name, schema, strict);
+        name = NULL;
+        schema = NULL;
+    } else {
+        snprintf(err, errlen, "text.format.type=%s not supported", type);
+        goto bad_keep_err;
+    }
+
+    /* Anything not handed over to *format is still owned here. */
+    free(type);
+    free(name);
+    free(schema);
+    return true;
+bad:
+    snprintf(err, errlen, "invalid text.format");
+bad_keep_err:
+    free(key);
+    free(type);
+    free(name);
+    free(schema);
+    return false;
+}
+
+/* Parse the Responses API "text" value. null resets *format; in an object
+ * only "format" is interpreted (via parse_responses_text_format_object()) and
+ * every other member is skipped. Returns false on malformed input; err is
+ * written only by the "format" parser and may otherwise stay untouched. */
+static bool parse_responses_text_value(const char **p,
+                                       ds4_text_format *format,
+                                       char *err,
+                                       size_t errlen) {
+    json_ws(p);
+    if (json_lit(p, "null")) {
+        ds4_text_format_clear(format);
+        return true;
+    }
+    if (**p != '{') return false;
+    (*p)++;
+    json_ws(p);
+    char *key = NULL;
+    while (**p && **p != '}') {
+        if (!json_string(p, &key)) return false;
+        json_ws(p);
+        if (**p != ':') goto fail;
+        (*p)++;
+        const bool ok = strcmp(key, "format") == 0
+            ? parse_responses_text_format_object(p, format, err, errlen)
+            : json_skip_value(p);
+        if (!ok) goto fail;
+        free(key);
+        key = NULL;
+        json_ws(p);
+        if (**p == ',') (*p)++;
+        json_ws(p);
+    }
+    if (**p != '}') goto fail;
+    (*p)++;
+    return true;
+fail:
+    free(key);
+    return false;
+}
+
 static bool json_content(const char **p, char **out) {
     json_ws(p);
     if (**p == '"') return json_string(p, out);
@@ -601,6 +988,7 @@ typedef struct {
     int cache_read_tokens;
     int cache_write_tokens;
     ds4_think_mode think_mode;
+    ds4_text_format text_format;
     bool has_tools;
     bool prompt_preserves_reasoning;
     /* For /v1/responses: emit reasoning_summary_* events / fields only when the
@@ -763,6 +1151,7 @@ static void request_free(request *r) {
     free(r->stops.v);
     free(r->raw_body);
     free(r->prompt_text);
+    ds4_text_format_clear(&r->text_format);
     stop_list_clear(&r->responses_live_call_ids);
     free(r->responses_live_call_ids.v);
     free(r->responses_live_suffix_text);
@@ -2726,6 +3115,15 @@ static bool parse_chat_request(ds4_engine *e, server *s, const char *body, int d
                 free(key);
                 goto bad;
             }
+        } else if (!strcmp(key, "response_format")) {
+            if (!parse_chat_response_format(&p, &r->text_format, err, errlen)) {
+                free(key);
+                chat_msgs_free(&msgs);
+                free(tool_schemas);
+                if (!err[0]) snprintf(err, errlen, "invalid response_format");
+                request_free(r);
+                return false;
+            }
         } else if (!strcmp(key, "thinking")) {
             if (!parse_thinking_control_value(&p, &thinking_enabled)) {
                 free(key);
@@ -2766,6 +3164,25 @@ static bool parse_chat_request(ds4_engine *e, server *s, const char *body, int d
         return false;
     }
     r->has_tools = tool_schemas && tool_schemas[0] && !tool_choice_none;
+    if (ds4_text_format_is_json(&r->text_format)) {
+        if (r->has_tools) {
+            snprintf(err, errlen,
+                     "structured outputs with tools are not supported");
+            chat_msgs_free(&msgs);
+            free(tool_schemas);
+            request_free(r);
+            return false;
+        }
+        if (!ds4_text_format_validate_with_llguidance(e, &r->text_format,
+                                                      err, errlen)) {
+            chat_msgs_free(&msgs);
+            free(tool_schemas);
+            request_free(r);
+            return false;
+        }
+        thinking_enabled = false;
+        got_thinking = true;
+    }
     if (!got_thinking && model_alias_disables_thinking(r->model)) thinking_enabled = false;
     if (!got_thinking && model_alias_enables_thinking(r->model)) thinking_enabled = true;
     r->think_mode = ds4_think_mode_for_context(
@@ -3815,6 +4232,17 @@ static bool parse_responses_request(ds4_engine *e, server *s, const char *body,
                 free(key);
                 goto bad;
             }
+        } else if (!strcmp(key, "text")) {
+            if (!parse_responses_text_value(&p, &r->text_format, err, errlen)) {
+                free(key);
+                chat_msgs_free(&msgs);
+                buf_free(&loaded_tool_schemas);
+                free(instructions);
+                free(tool_schemas);
+                if (!err[0]) snprintf(err, errlen, "invalid text");
+                request_free(r);
+                return false;
+            }
         } else if (!strcmp(key, "reasoning")) {
             bool effort_seen = false;
             if (!parse_responses_reasoning(&p, &reasoning_effort,
@@ -3904,6 +4332,32 @@ static bool parse_responses_request(ds4_engine *e, server *s, const char *body,
         (!tool_choice_none && combined_tool_schemas.len) ?
         combined_tool_schemas.ptr : NULL;
     r->has_tools = active_tool_schemas && active_tool_schemas[0];
+    if (ds4_text_format_is_json(&r->text_format)) {
+        if (r->has_tools) {
+            snprintf(err, errlen,
+                     "structured outputs with tools are not supported");
+            chat_msgs_free(&msgs);
+            buf_free(&combined_tool_schemas);
+            buf_free(&loaded_tool_schemas);
+            free(instructions);
+            free(tool_schemas);
+            request_free(r);
+            return false;
+        }
+        if (!ds4_text_format_validate_with_llguidance(e, &r->text_format,
+                                                      err, errlen)) {
+            chat_msgs_free(&msgs);
+            buf_free(&combined_tool_schemas);
+            buf_free(&loaded_tool_schemas);
+            free(instructions);
+            free(tool_schemas);
+            request_free(r);
+            return false;
+        }
+        thinking_enabled = false;
+        got_thinking = true;
+        r->reasoning_summary_emit = false;
+    }
     if (!got_thinking && model_alias_disables_thinking(r->model)) thinking_enabled = false;
     if (!got_thinking && model_alias_enables_thinking(r->model)) thinking_enabled = true;
     r->think_mode = ds4_think_mode_for_context(
@@ -5966,6 +6420,10 @@ static bool request_uses_structured_stream(const request *r) {
                          request_uses_openai_live_stream(r));
 }
 
+static bool request_uses_structured_decoder(const request *r) {
+    return r && r->kind == REQ_CHAT && ds4_text_format_is_json(&r->text_format);
+} /* i.e. only REQ_CHAT requests that asked for a JSON text format */
+
 /* Codex' Responses API uses 24-hex suffixes for response/item ids. Prefix
  * controls the variant (resp_, rs_, msg_, fc_) so each event references a
  * stable identifier across output_item.added / .done. */
@@ -9907,6 +10365,7 @@ static bool should_canonicalize_tool_checkpoint(const server *s, const tool_call
 static void generate_job(server *s, job *j) {
     char err[160];
     err[0] = '\0';
+    ds4_llguidance *structured = NULL;
     const int old_pos = ds4_session_pos(s->session);
     const int common = ds4_session_common_prefix(s->session, &j->req.prompt);
     trace_cache_diag cache_diag = {0};
@@ -10064,6 +10523,25 @@ static void generate_job(server *s, job *j) {
     char req_flags[64];
     log_flags(req_flags, sizeof(req_flags), responses_protocol,
               j->req.has_tools, false, false, false);
+    if (request_uses_structured_decoder(&j->req)) {
+        structured = ds4_llguidance_create(
+            s->engine,
+            ds4_text_format_constraint_type(&j->req.text_format),
+            ds4_text_format_constraint_data(&j->req.text_format),
+            err,
+            sizeof(err));
+        if (!structured) {
+            ds4_tokens_free(&effective_prompt);
+            free(disk_cache_path);
+            trace_event(s, trace_id, "structured output init failed: %s",
+                        err[0] ? err : "unknown error");
+            http_error(j->fd, s->enable_cors, 400,
+                       err[0] ? err : "structured output init failed");
+            return;
+        }
+        trace_event(s, trace_id, "structured output constraint=%s",
+                    ds4_text_format_constraint_type(&j->req.text_format));
+    }
     if (responses_live_continuation) {
         server_log(DS4_LOG_PREFILL,
                    "ds4-server: responses live continuation RESPPROTO match=%s ids=%d cached=%d prompt=%d",
@@ -10149,6 +10627,7 @@ static void generate_job(server *s, job *j) {
                                                   cold_store_len);
             kv_cache_discard_failed_disk_entry(s, disk_cache_path);
             free(disk_cache_path);
+            ds4_llguidance_free(structured);
             trace_event(s, trace_id, "prefill failed: %s", err);
             send_prefill_failure_response(s, j, &progress, ctx_span, req_flags, err);
             return;
@@ -10172,6 +10651,7 @@ static void generate_job(server *s, job *j) {
                                               cold_store_len);
         kv_cache_discard_failed_disk_entry(s, disk_cache_path);
         free(disk_cache_path);
+        ds4_llguidance_free(structured);
         trace_event(s, trace_id, "prefill failed: %s", err);
         send_prefill_failure_response(s, j, &progress, ctx_span, req_flags, err);
         return;
@@ -10222,6 +10702,7 @@ static void generate_job(server *s, job *j) {
                        req_flags[0] ? " " : "",
                        req_flags);
             ds4_tokens_free(&effective_prompt);
+            ds4_llguidance_free(structured);
             return;
         }
         /* The prefill progress callback may have already sent the SSE headers
@@ -10235,6 +10716,7 @@ static void generate_job(server *s, job *j) {
                        req_flags[0] ? " " : "",
                        req_flags);
             ds4_tokens_free(&effective_prompt);
+            ds4_llguidance_free(structured);
             return;
         }
         progress.headers_sent = true;
@@ -10243,12 +10725,14 @@ static void generate_job(server *s, job *j) {
                                       prompt_tokens, &anthropic_live)) {
             server_log(DS4_LOG_GENERATION, "ds4-server: chat ctx=%s anthropic stream start failed", ctx_span);
             ds4_tokens_free(&effective_prompt);
+            ds4_llguidance_free(structured);
             return;
         }
         if (j->req.api == API_OPENAI && j->req.kind == REQ_CHAT &&
             !sse_chunk(j->fd, &j->req, id, NULL, NULL)) {
             server_log(DS4_LOG_GENERATION, "ds4-server: chat ctx=%s openai role chunk failed", ctx_span);
             ds4_tokens_free(&effective_prompt);
+            ds4_llguidance_free(structured);
             return;
         }
         if (openai_live_chat) openai_stream_start(&j->req, &openai_live);
@@ -10263,6 +10747,7 @@ static void generate_job(server *s, job *j) {
                            req_flags);
                 responses_stream_free(&responses_live);
                 ds4_tokens_free(&effective_prompt);
+                ds4_llguidance_free(structured);
                 return;
             }
         }
@@ -10320,7 +10805,15 @@ static void generate_job(server *s, job *j) {
         if (in_tool_call && !dsml_decode_state_uses_payload_sampling(dsml_state)) {
             temperature = 0.0f;
         }
-        int token = ds4_session_sample(s->session, temperature, top_k, top_p, min_p, &rng);
+        int token = structured ?
+            ds4_llguidance_sample(structured, s->session,
+                                  temperature, top_k, top_p, min_p,
+                                  &rng, err, sizeof(err)) :
+            ds4_session_sample(s->session, temperature, top_k, top_p, min_p, &rng);
+        if (token < 0) {
+            finish = "error";
+            break;
+        }
         if (token == ds4_token_eos(s->engine)) {
             finish = "stop";
             break;
@@ -10328,7 +10821,8 @@ static void generate_job(server *s, job *j) {
 
         int toks[17];
         int ntok = 0;
-        if (temperature <= 0.0f &&
+        if (!structured &&
+            temperature <= 0.0f &&
             ds4_engine_mtp_draft_tokens(s->engine) > 1 &&
             getenv("DS4_MTP_SPEC_DISABLE") == NULL)
         {
@@ -10361,6 +10855,13 @@ static void generate_job(server *s, job *j) {
                 stop_decode = true;
                 break;
             }
+            if (structured &&
+                !ds4_llguidance_accept(structured, s->engine, token,
+                                       err, sizeof(err))) {
+                finish = "error";
+                stop_decode = true;
+                break;
+            }
 
             size_t piece_len = 0;
             char *piece = ds4_token_text(s->engine, token, &piece_len);
@@ -10916,6 +11417,7 @@ static void generate_job(server *s, job *j) {
     anthropic_stream_free(&anthropic_live);
     openai_stream_free(&openai_live);
     responses_stream_free(&responses_live);
+    ds4_llguidance_free(structured);
     buf_free(&text);
     ds4_tokens_free(&effective_prompt);
 }
@@ -11767,6 +12269,116 @@ static void test_assert(bool cond, const char *file, int line, const char *expr)
 
 #define TEST_ASSERT(expr) test_assert((expr), __FILE__, __LINE__, #expr)
 
+static void test_parse_chat_response_format_json_schema(void) {
+    const char *json =
+        "{\"type\":\"json_schema\",\"json_schema\":{"
+        "\"name\":\"calendar_event\",\"strict\":true,"
+        "\"schema\":{\"type\":\"object\",\"properties\":{"
+        "\"name\":{\"type\":\"string\"}},\"required\":[\"name\"],"
+        "\"additionalProperties\":false}}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+    /* Wrapper form: name, strict and schema are captured, all input consumed. */
+    TEST_ASSERT(parse_chat_response_format(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_JSON_SCHEMA);
+    TEST_ASSERT(fmt.name && !strcmp(fmt.name, "calendar_event"));
+    TEST_ASSERT(fmt.strict);
+    TEST_ASSERT(fmt.schema_json && strstr(fmt.schema_json, "\"additionalProperties\""));
+    json_ws(&p);
+    TEST_ASSERT(*p == '\0');
+
+    ds4_text_format_clear(&fmt);
+}
+
+static void test_parse_chat_response_format_json_object(void) {
+    const char *json = "{\"type\":\"json_object\"}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+    /* A bare json_object type carries no schema and maps to "json_object". */
+    TEST_ASSERT(parse_chat_response_format(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_JSON_OBJECT);
+    TEST_ASSERT(fmt.schema_json == NULL);
+    TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt), "json_object"));
+
+    ds4_text_format_clear(&fmt);
+}
+
+static void test_parse_chat_response_format_rejects_missing_schema(void) {
+    const char *json = "{\"type\":\"json_schema\",\"json_schema\":{\"name\":\"bad\"}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+    /* A json_schema wrapper without "schema" must fail and say so in err. */
+    TEST_ASSERT(!parse_chat_response_format(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(strstr(err, "schema is required") != NULL);
+
+    ds4_text_format_clear(&fmt);
+}
+
+static void test_parse_responses_text_format_json_schema(void) {
+    const char *json =
+        "{\"format\":{\"type\":\"json_schema\","
+        "\"name\":\"calendar_event\",\"strict\":true,"
+        "\"schema\":{\"type\":\"object\",\"properties\":{"
+        "\"date\":{\"type\":\"string\"}},\"required\":[\"date\"],"
+        "\"additionalProperties\":false}}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+    /* Responses form: name, strict and schema sit directly inside "format". */
+    TEST_ASSERT(parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_JSON_SCHEMA);
+    TEST_ASSERT(fmt.name && !strcmp(fmt.name, "calendar_event"));
+    TEST_ASSERT(fmt.strict);
+    TEST_ASSERT(fmt.schema_json && strstr(fmt.schema_json, "\"required\""));
+    TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt), "json_schema"));
+    json_ws(&p);
+    TEST_ASSERT(*p == '\0');
+
+    ds4_text_format_clear(&fmt);
+}
+
+static void test_parse_responses_text_format_json_object(void) {
+    const char *json = "{\"format\":{\"type\":\"json_object\"}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+    /* A json_object format without a schema maps to "json_object". */
+    TEST_ASSERT(parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_JSON_OBJECT);
+    TEST_ASSERT(fmt.schema_json == NULL);
+    TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt), "json_object"));
+
+    ds4_text_format_clear(&fmt);
+}
+
+static void test_parse_responses_text_format_rejects_unknown_type(void) {
+    const char *json = "{\"format\":{\"type\":\"xml\"}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+    /* An unknown format type is rejected with a "not supported" error. */
+    TEST_ASSERT(!parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(strstr(err, "not supported") != NULL);
+
+    ds4_text_format_clear(&fmt);
+}
+
+static void test_parse_responses_text_format_text_is_noop(void) {
+    const char *json = "{\"format\":{\"type\":\"text\"}}";
+    const char *p = json;
+    ds4_text_format fmt = {0};
+    char err[160] = {0};
+    /* type "text" leaves fmt in its zeroed, unconstrained state. */
+    TEST_ASSERT(parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+    TEST_ASSERT(fmt.type == DS4_TEXT_FORMAT_TEXT);
+    TEST_ASSERT(fmt.schema_json == NULL);
+
+    ds4_text_format_clear(&fmt);
+}
+
 static void test_tool_schema_order_from_anthropic_schema(void) {
     tool_schema_orders orders = {0};
     tool_schema_orders_add_json(&orders,
@@ -15554,6 +16166,13 @@ static void ds4_server_unit_tests_run(void) {
     test_render_drops_old_reasoning_without_tools();
     test_render_preserves_reasoning_with_tools();
     test_render_chat_prompt_text_renders_tools_before_system();
+    test_parse_chat_response_format_json_schema();
+    test_parse_chat_response_format_json_object();
+    test_parse_chat_response_format_rejects_missing_schema();
+    test_parse_responses_text_format_json_schema();
+    test_parse_responses_text_format_json_object();
+    test_parse_responses_text_format_rejects_unknown_type();
+    test_parse_responses_text_format_text_is_noop();
     test_tool_schema_order_from_anthropic_schema();
     test_tool_schema_order_from_openai_tools();
     test_tool_schema_order_from_responses_tool_search();
diff --git a/tests/structured_outputs_stress.py b/tests/structured_outputs_stress.py
new file mode 100755
index 000000000..9bc7610fc
--- /dev/null
+++ b/tests/structured_outputs_stress.py
@@ -0,0 +1,424 @@
+#!/usr/bin/env python3
+"""Stress JSON structured outputs on OpenAI-compatible chat/responses APIs.
+
+Examples:
+  python3 tests/structured_outputs_stress.py \
+      --base-url http://127.0.0.1:8000/v1 --model ds4 --apis chat,responses
+
+  python3 tests/structured_outputs_stress.py \
+      --base-url http://127.0.0.1:8080/v1 --model qwen --apis chat
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import time
+import urllib.error
+import urllib.request
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass(frozen=True)
+class Case:
+    name: str  # case id: sent as the json_schema name and matched by --case
+    prompt: str  # user prompt sent to the model
+    schema: dict[str, Any] | None  # schema to request and validate against; None skips validation
+    json_object: bool = False  # request json_object mode instead of json_schema
+
+
+CASES: list[Case] = [
+    Case(
+        name="calendar_event",
+        prompt=(
+            "Create one calendar event for Alice and Bob having lunch on "
+            "2026-06-01 at noon. Return only the requested JSON object."
+        ),
+        schema={
+            "type": "object",
+            "properties": {
+                "name": {"type": "string"},
+                "date": {"type": "string"},
+                "participants": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "minItems": 1,
+                    "maxItems": 5,
+                },
+            },
+            "required": ["name", "date", "participants"],
+            "additionalProperties": False,
+        },
+    ),
+    Case(
+        name="enum_const_integer_boolean",
+        prompt=(
+            "Return a compact health-check result. Use status ok, one priority, "
+            "a retry count, and whether the system is active."
+        ),
+        schema={
+            "type": "object",
+            "properties": {
+                "status": {"const": "ok"},
+                "priority": {"type": "string", "enum": ["low", "medium", "high"]},
+                "retry_count": {"type": "integer", "minimum": 0, "maximum": 5},
+                "active": {"type": "boolean"},
+            },
+            "required": ["status", "priority", "retry_count", "active"],
+            "additionalProperties": False,
+        },
+    ),
+    Case(
+        name="nested_arrays",
+        prompt=(
+            "Return a 2 by 2 integer matrix and two short labels. Keep values "
+            "small and return only JSON."
+        ),
+        schema={
+            "type": "object",
+            "properties": {
+                "matrix": {
+                    "type": "array",
+                    "minItems": 2,
+                    "maxItems": 2,
+                    "items": {
+                        "type": "array",
+                        "minItems": 2,
+                        "maxItems": 2,
+                        "items": {"type": "integer", "minimum": -9, "maximum": 9},
+                    },
+                },
+                "labels": {
+                    "type": "array",
+                    "minItems": 2,
+                    "maxItems": 2,
+                    "items": {"type": "string"},
+                },
+            },
+            "required": ["matrix", "labels"],
+            "additionalProperties": False,
+        },
+    ),
+    Case(
+        name="nullable_anyof_number_bounds",
+        prompt=(
+            "Return a score between zero and one, and use either an owner name "
+            "or null if unknown."
+        ),
+        schema={
+            "type": "object",
+            "properties": {
+                "owner": {"anyOf": [{"type": "string"}, {"type": "null"}]},
+                "score": {"type": "number", "minimum": 0, "maximum": 1},
+            },
+            "required": ["owner", "score"],
+            "additionalProperties": False,
+        },
+    ),
+    Case(
+        name="pattern_string",
+        prompt="Return an inventory code in the form two uppercase letters, dash, three digits.",
+        schema={
+            "type": "object",
+            "properties": {
+                "code": {"type": "string", "pattern": "^[A-Z]{2}-[0-9]{3}$"}
+            },
+            "required": ["code"],
+            "additionalProperties": False,
+        },
+    ),
+    Case(
+        name="json_object_mode",
+        prompt="Return a JSON object with two fields describing a tiny task list.",
+        schema=None,
+        json_object=True,
+    ),
+]
+
+
+class ValidationError(Exception):
+    """Raised by validate_schema when a value does not satisfy its schema."""
+
+
+def type_matches(value: Any, typ: str) -> bool:
+    """Return True if value has JSON Schema type typ; unrecognized type names always match."""
+    is_bool = isinstance(value, bool)  # bool subclasses int, so numeric checks exclude it
+    table = (
+        ("object", isinstance(value, dict)),
+        ("array", isinstance(value, list)),
+        ("string", isinstance(value, str)),
+        ("integer", isinstance(value, int) and not is_bool),
+        ("number", isinstance(value, (int, float)) and not is_bool),
+        ("boolean", is_bool),
+        ("null", value is None),
+    )
+    for known, verdict in table:
+        if typ == known:
+            return verdict
+    return True
+
+
+def validate_schema(value: Any, schema: dict[str, Any], path: str = "$") -> None:
+    """Raise ValidationError at the first place value violates the supported schema keywords."""
+    if "anyOf" in schema:
+        errors: list[str] = []
+        for option in schema["anyOf"]:
+            try:
+                validate_schema(value, option, path)
+                return
+            except ValidationError as exc:
+                errors.append(str(exc))
+        raise ValidationError(f"{path}: did not match anyOf: {'; '.join(errors)}")
+
+    if "const" in schema and value != schema["const"]:
+        raise ValidationError(f"{path}: expected const {schema['const']!r}, got {value!r}")
+    if "enum" in schema and value not in schema["enum"]:
+        raise ValidationError(f"{path}: expected one of {schema['enum']!r}, got {value!r}")
+
+    typ = schema.get("type")
+    if isinstance(typ, list):
+        if not any(type_matches(value, t) for t in typ):
+            raise ValidationError(f"{path}: wrong type {type(value).__name__}, expected {typ}")
+    elif isinstance(typ, str) and not type_matches(value, typ):
+        raise ValidationError(f"{path}: wrong type {type(value).__name__}, expected {typ}")
+
+    if typ == "object" or "properties" in schema:
+        if not isinstance(value, dict):
+            raise ValidationError(f"{path}: expected object")
+        props = schema.get("properties", {})
+        for key in schema.get("required", []):
+            if key not in value:
+                raise ValidationError(f"{path}: missing required property {key!r}")
+        if schema.get("additionalProperties") is False:
+            extra = sorted(set(value) - set(props))
+            if extra:
+                raise ValidationError(f"{path}: extra properties {extra!r}")
+        for key, sub in props.items():
+            if key in value:
+                validate_schema(value[key], sub, f"{path}.{key}")
+
+    if typ == "array" or "items" in schema:
+        if not isinstance(value, list):
+            raise ValidationError(f"{path}: expected array")
+        min_items, max_items = schema.get("minItems"), schema.get("maxItems")
+        if min_items is not None and len(value) < min_items:
+            raise ValidationError(f"{path}: expected at least {min_items} items")
+        if max_items is not None and len(value) > max_items:
+            raise ValidationError(f"{path}: expected at most {max_items} items")
+        items = schema.get("items")
+        if isinstance(items, dict):
+            for i, item in enumerate(value):
+                validate_schema(item, items, f"{path}[{i}]")
+
+    if isinstance(value, str) and "pattern" in schema:
+        if re.search(schema["pattern"], value) is None:  # JSON Schema patterns are unanchored
+            raise ValidationError(f"{path}: {value!r} does not match {schema['pattern']!r}")
+
+    if isinstance(value, (int, float)) and not isinstance(value, bool):
+        if "minimum" in schema and value < schema["minimum"]:
+            raise ValidationError(f"{path}: {value!r} is below minimum {schema['minimum']!r}")
+        if "maximum" in schema and value > schema["maximum"]:
+            raise ValidationError(f"{path}: {value!r} is above maximum {schema['maximum']!r}")
+
+
+def post_json(url: str, payload: dict[str, Any], timeout: float) -> dict[str, Any]:
+    """POST payload as compact JSON; return the decoded body or raise RuntimeError on an error."""
+    request = urllib.request.Request(
+        url,
+        data=json.dumps(payload, separators=(",", ":")).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=timeout) as response:
+            text = response.read().decode("utf-8", errors="replace")
+    except urllib.error.HTTPError as http_err:
+        detail = http_err.read().decode("utf-8", errors="replace")
+        raise RuntimeError(f"HTTP {http_err.code}: {detail[:1000]}") from http_err
+    except urllib.error.URLError as url_err:
+        raise RuntimeError(str(url_err)) from url_err
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError as decode_err:
+        raise RuntimeError(f"invalid JSON response: {text[:1000]}") from decode_err
+    if isinstance(parsed, dict) and parsed.get("error"):
+        raise RuntimeError(f"API error: {parsed['error']!r}")
+    return parsed
+
+
+def chat_payload(model: str, case: Case, json_object_schema: bool) -> dict[str, Any]:
+    """Build the /chat/completions request body for one case."""
+    if not case.json_object:
+        response_format: dict[str, Any] = {
+            "type": "json_schema",
+            "json_schema": {
+                "name": case.name,
+                "strict": True,
+                "schema": case.schema,
+            },
+        }
+    elif json_object_schema:  # some servers require a concrete schema in json_object mode
+        response_format = {"type": "json_object", "schema": {"type": "object"}}
+    else:
+        response_format = {"type": "json_object"}
+    return {
+        "model": model,
+        "messages": [{"role": "user", "content": case.prompt}],
+        "max_tokens": 256,
+        "temperature": 0,
+        "response_format": response_format,
+    }
+
+
+def responses_payload(model: str, case: Case, json_object_schema: bool) -> dict[str, Any]:
+    """Build the /responses request body for one case."""
+    if not case.json_object:
+        fmt: dict[str, Any] = {
+            "type": "json_schema",
+            "name": case.name,
+            "strict": True,
+            "schema": case.schema,
+        }
+    elif json_object_schema:  # some servers require a concrete schema in json_object mode
+        fmt = {"type": "json_object", "schema": {"type": "object"}}
+    else:
+        fmt = {"type": "json_object"}
+    return {
+        "model": model,
+        "input": case.prompt,
+        "max_output_tokens": 256,
+        "temperature": 0,
+        "text": {"format": fmt},
+    }
+
+
+def extract_chat_text(body: dict[str, Any]) -> str:
+    """Return the first choice's message text (joining text parts), else raise RuntimeError."""
+    choices = body.get("choices")
+    if not isinstance(choices, list) or not choices:
+        raise RuntimeError(f"missing choices in chat response: {body!r}")
+    message = choices[0].get("message") if isinstance(choices[0], dict) else None
+    content = message.get("content") if isinstance(message, dict) else None  # null-safe
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return "".join(
+            part["text"] for part in content
+            if isinstance(part, dict) and isinstance(part.get("text"), str)
+        )
+    raise RuntimeError(f"missing text content in chat response: {body!r}")
+
+
+def extract_responses_text(body: dict[str, Any]) -> str:
+    """Return output_text if it is a string, else the joined text parts of message items."""
+    if isinstance(body.get("output_text"), str):
+        return body["output_text"]
+    out: list[str] = []
+    for item in body.get("output") or []:  # tolerate an explicit JSON null
+        if not isinstance(item, dict) or item.get("type") != "message":
+            continue
+        for part in item.get("content") or []:  # tolerate an explicit JSON null
+            if isinstance(part, dict) and isinstance(part.get("text"), str):
+                out.append(part["text"])
+    if out:
+        return "".join(out)
+    raise RuntimeError(f"missing output text in responses response: {body!r}")
+
+
+def check_case(
+    api: str,
+    base_url: str,
+    model: str,
+    case: Case,
+    timeout: float,
+    json_object_schema: bool,
+) -> str:
+    """Run one case against one API and validate the model's reply.
+
+    Returns the reply re-serialized as canonical (sorted-key) JSON.
+    Raises RuntimeError for an unknown api, a failed request, or a reply
+    that is not a JSON object; validate_schema raises ValidationError
+    when the reply violates case.schema.
+    """
+    if api == "chat":
+        path, build, extract = "/chat/completions", chat_payload, extract_chat_text
+    elif api == "responses":
+        path, build, extract = "/responses", responses_payload, extract_responses_text
+    else:
+        raise RuntimeError(f"unknown api {api!r}")
+    request = build(model, case, json_object_schema)
+    text = extract(post_json(f"{base_url}{path}", request, timeout))
+
+    # The reply text must itself parse as a JSON object before schema validation.
+    try:
+        parsed = json.loads(text.strip())
+    except json.JSONDecodeError as decode_err:
+        raise RuntimeError(f"{api}/{case.name}: output is not JSON: {text!r}") from decode_err
+    if not isinstance(parsed, dict):
+        raise RuntimeError(f"{api}/{case.name}: output is not a JSON object: {parsed!r}")
+    if case.schema is not None:
+        validate_schema(parsed, case.schema)
+    return json.dumps(parsed, ensure_ascii=False, sort_keys=True)
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line parser and parse sys.argv."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-url", required=True, help="Base URL, usually http://host:port/v1")
+    parser.add_argument("--model", required=True)
+    parser.add_argument("--apis", default="chat,responses", help="Comma-separated: chat,responses")
+    parser.add_argument("--case", action="append", help="Run only this case name; may repeat")
+    parser.add_argument("--repeat", type=int, default=1)
+    parser.add_argument("--timeout", type=float, default=120.0)
+    parser.add_argument(
+        "--json-object-schema", action="store_true",
+        help="Send {'type':'object'} with json_object mode for servers that require a concrete schema.",
+    )
+    parser.add_argument("--verbose", action="store_true")
+    return parser.parse_args()
+
+
+def main() -> int:
+    """Run the selected cases; return 0 if all pass, 1 on any failure, 2 on an unknown --case."""
+    args = parse_args()
+    base_url = args.base_url.rstrip("/")
+    apis = [x.strip() for x in args.apis.split(",") if x.strip()]
+    selected = set(args.case or [])
+    cases = [c for c in CASES if not selected or c.name in selected]
+    missing = selected - {c.name for c in CASES}
+    if missing:
+        print(f"unknown case(s): {', '.join(sorted(missing))}", file=sys.stderr)
+        return 2
+
+    failures = 0
+    for repeat in range(args.repeat):
+        for api in apis:
+            for case in cases:
+                label = f"{api}/{case.name}"
+                if args.repeat > 1:
+                    label = f"{label}#{repeat + 1}"
+                # perf_counter is monotonic, so elapsed is immune to wall-clock adjustments.
+                t0 = time.perf_counter()
+                try:
+                    value = check_case(
+                        api,
+                        base_url,
+                        args.model,
+                        case,
+                        args.timeout,
+                        args.json_object_schema,
+                    )
+                    elapsed = time.perf_counter() - t0
+                    suffix = f" {value}" if args.verbose else ""
+                    print(f"PASS {label} {elapsed:.2f}s{suffix}")
+                except Exception as exc:
+                    failures += 1
+                    print(f"FAIL {label}: {exc}", file=sys.stderr)
+    return 1 if failures else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 08e053d31d93d46eae67fa26478e9435fba858b5 Mon Sep 17 00:00:00 2001
From: fry69 <142489379+fry69@users.noreply.github.com>
Date: Fri, 29 May 2026 20:43:36 +0200
Subject: [PATCH 2/9] build: add $(DS4_LLGUIDANCE_DEPS) prerequisite to all
 binary targets

Ensure ds4, ds4-server, ds4-bench, ds4-eval, ds4-agent, and cpu
targets depend on libllguidance.a when LLGUIDANCE=1, so that
`cargo build` runs before linking. Previously only ds4-server
triggered the build via ds4_llguidance.o, causing other binaries
to fail linking against a nonexistent library.
---
 Makefile | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index 9cad30654..41631d33f 100644
--- a/Makefile
+++ b/Makefile
@@ -71,22 +71,22 @@ help:
 	@echo "  make test         Build and run tests"
 	@echo "  make clean        Remove build outputs"
 
-ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
+ds4: ds4_cli.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(CC) $(CFLAGS) -o $@ ds4_cli.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)
 
-ds4-server: ds4_server.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS)
+ds4-server: ds4_server.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(CC) $(CFLAGS) -o $@ ds4_server.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(METAL_LDLIBS)
 
-ds4-bench: ds4_bench.o $(CORE_OBJS)
+ds4-bench: ds4_bench.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(CC) $(CFLAGS) -o $@ ds4_bench.o $(CORE_OBJS) $(METAL_LDLIBS)
 
-ds4-eval: ds4_eval.o $(CORE_OBJS)
+ds4-eval: ds4_eval.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(CC) $(CFLAGS) -o $@ ds4_eval.o $(CORE_OBJS) $(METAL_LDLIBS)
 
-ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
+ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(CC) $(CFLAGS) -o $@ ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)
 
-cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS)
+cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
@@ -122,22 +122,22 @@ cuda:
 	fi
 	$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent CUDA_ARCH="$(CUDA_ARCH)"
 
-ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
+ds4: ds4_cli.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-ds4-server: ds4_server.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS)
+ds4-server: ds4_server.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-ds4-bench: ds4_bench.o $(CORE_OBJS)
+ds4-bench: ds4_bench.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-ds4-eval: ds4_eval.o $(CORE_OBJS)
+ds4-eval: ds4_eval.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS)
+ds4-agent: ds4_agent.o ds4_web.o ds4_kvstore.o linenoise.o $(CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
 
-cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS)
+cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o ds4_eval_cpu.o ds4_agent_cpu.o ds4_web.o ds4_kvstore.o linenoise.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(DS4_LLGUIDANCE_DEPS)
 	$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o ds4_kvstore.o rax.o $(SERVER_EXTRA_OBJS) $(CPU_CORE_OBJS) $(LDLIBS)
 	$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)

From 96c24df2662158bc4a51a09ef003b90cf855d56e Mon Sep 17 00:00:00 2001
From: fry69 <142489379+fry69@users.noreply.github.com>
Date: Fri, 29 May 2026 20:51:18 +0200
Subject: [PATCH 3/9] build: add distclean target to remove .deps

Introduce a distclean target that runs clean and then removes the
.deps directory (cloned llguidance source + Rust build artifacts).
This avoids forcing a re-clone on every `make clean` while giving
users an explicit way to fully reset when needed.
---
 Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 41631d33f..d14d1009c 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,7 @@ CUDA_LDLIBS += $(LLGUIDANCE_LDLIBS)
 METAL_LDLIBS := $(LDLIBS)
 endif
 
-.PHONY: all help clean test cpu cuda cuda-spark cuda-generic cuda-regression
+.PHONY: all help clean distclean test cpu cuda cuda-spark cuda-generic cuda-regression
 
 ifeq ($(UNAME_S),Darwin)
 all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
@@ -241,3 +241,6 @@ test: ds4_test ds4-eval
 
 clean:
 	rm -f ds4 ds4-server ds4-bench ds4-eval ds4-agent ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
+
+distclean: clean
+	rm -rf .deps

From afab720363c295fb147333a5b38e6145e291ed64 Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Fri, 29 May 2026 22:52:25 +0100
Subject: [PATCH 4/9] build: make llguidance path explicit

---
 Makefile  | 8 ++------
 README.md | 4 ++++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index d14d1009c..76b492b3a 100644
--- a/Makefile
+++ b/Makefile
@@ -14,19 +14,15 @@ OBJCFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -fo
 LDLIBS ?= -lm -pthread
 METAL_SRCS := $(wildcard metal/*.metal)
 LLGUIDANCE ?= 0
+LLGUIDANCE_DIR ?= .deps/llguidance
 LLGUIDANCE_REPO ?= https://github.com/guidance-ai/llguidance
 LLGUIDANCE_TAG ?= v1.7.5
 SERVER_EXTRA_OBJS := ds4_llguidance.o
 
 ifeq ($(LLGUIDANCE),1)
-ifeq ($(strip $(LLGUIDANCE_DIR)),)
-ifneq ($(wildcard ../../llguidance/parser/llguidance.h),)
-LLGUIDANCE_DIR := ../../llguidance
-else
-LLGUIDANCE_DIR := .deps/llguidance
+ifeq ($(LLGUIDANCE_DIR),.deps/llguidance)
 LLGUIDANCE_NEEDS_CLONE := 1
 endif
-endif
 LLGUIDANCE_LIB := $(LLGUIDANCE_DIR)/target/release/libllguidance.a
 LLGUIDANCE_LDLIBS := $(LLGUIDANCE_LIB)
 ifneq ($(UNAME_S),Darwin)
diff --git a/README.md b/README.md
index a4aace8b5..26c8a7b8e 100644
--- a/README.md
+++ b/README.md
@@ -646,6 +646,10 @@ Structured outputs are available when the server is built with llguidance:
 make LLGUIDANCE=1
 ```
 
+By default, this clones llguidance into `.deps/llguidance` and builds the
+static library there. To use an existing checkout instead, pass
+`LLGUIDANCE_DIR=/path/to/llguidance`.
+
 With that build, `/v1/chat/completions` supports
 `response_format.type=json_schema` and `response_format.type=json_object`;
 `/v1/responses` supports the same modes through `text.format`. Structured

From 50b8035e21793b6ae3aee11d990854db186fd385 Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Sat, 30 May 2026 12:04:17 +0100
Subject: [PATCH 5/9] structured outputs: expose llguidance format types

Expose regex, Lark, and llguidance structured-output formats through the existing Chat Completions and Responses structured-output surfaces, reusing the current llguidance constrained decoder.
---
 README.md          |    8 +-
 ds4_llguidance.c   |   15 +-
 ds4_server.c       |  198 +++++++-
 stress-test-cli.py | 1086 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1294 insertions(+), 13 deletions(-)
 create mode 100755 stress-test-cli.py

diff --git a/README.md b/README.md
index 26c8a7b8e..6d55447ce 100644
--- a/README.md
+++ b/README.md
@@ -651,10 +651,10 @@ static library there. To use an existing checkout instead, pass
 `LLGUIDANCE_DIR=/path/to/llguidance`.
 
 With that build, `/v1/chat/completions` supports
-`response_format.type=json_schema` and `response_format.type=json_object`;
-`/v1/responses` supports the same modes through `text.format`. Structured
-outputs use constrained decoding, disable thinking for that turn, and currently
-cannot be combined with tools.
+`response_format.type=json_schema`, `json_object`, `regex`, `lark`, and
+`llguidance`; `/v1/responses` supports the same modes through `text.format`.
+Structured outputs use constrained decoding, disable thinking for that turn,
+and currently cannot be combined with tools.
 
 `/v1/messages` is the Anthropic-compatible endpoint used by Claude Code style
 clients. It accepts `system`, `messages`, `tools`, `tool_choice`, `max_tokens`,
diff --git a/ds4_llguidance.c b/ds4_llguidance.c
index 1a39ea3e8..5c6854d2f 100644
--- a/ds4_llguidance.c
+++ b/ds4_llguidance.c
@@ -24,6 +24,7 @@ struct ds4_llguidance {
     size_t mask_words;
     int n_vocab;
     int eos_token;
+    bool deny_leading_ws;
     bool started;
 #else
     int unused;
@@ -109,6 +110,13 @@ static bool token_text_is_special(const char *p, size_t len) {
     return false;
 }
 
+static bool constraint_uses_json_leading_ws_rule(const char *constraint_type) {
+    /* Only the JSON constraint types enable the leading-whitespace deny rule. */
+    if (!constraint_type) return false;
+    return strcmp(constraint_type, "json") == 0 || strcmp(constraint_type, "json_schema") == 0 ||
+           strcmp(constraint_type, "json_object") == 0;
+}
+
 static void bitset_set(uint32_t *mask, int token) {
     mask[(uint32_t)token / 32u] |= UINT32_C(1) << ((uint32_t)token & 31u);
 }
@@ -334,6 +342,8 @@ ds4_llguidance *ds4_llguidance_create(ds4_engine *e,
     g->mask_words = mask_bytes / sizeof(uint32_t);
     g->n_vocab = n_vocab;
     g->eos_token = ds4_token_eos(e);
+    g->deny_leading_ws =
+        constraint_uses_json_leading_ws_rule(constraint_type);
     g->started = false;
     return g;
 }
@@ -372,7 +382,8 @@ int ds4_llguidance_sample(ds4_llguidance *g,
 
     const uint32_t *deny = NULL;
     size_t deny_words = 0;
-    if (!g->started &&
+    if (g->deny_leading_ws &&
+        !g->started &&
         mask_has_non_denied_token(allow, g->mask_words, g->leading_ws_mask,
                                   g->leading_ws_words, g->n_vocab))
     {
@@ -399,7 +410,7 @@ bool ds4_llguidance_accept(ds4_llguidance *g,
                 llg_matcher_get_error(g->matcher));
         return false;
     }
-    if (!g->started && e) {
+    if (g->deny_leading_ws && !g->started && e) {
         size_t len = 0;
         char *piece = ds4_token_text(e, token, &len);
         if (bytes_have_non_json_ws(piece, len)) g->started = true;
diff --git a/ds4_server.c b/ds4_server.c
index add158ca4..de9acbc03 100644
--- a/ds4_server.c
+++ b/ds4_server.c
@@ -407,6 +407,9 @@ typedef enum {
     DS4_TEXT_FORMAT_TEXT,
     DS4_TEXT_FORMAT_JSON_OBJECT,
     DS4_TEXT_FORMAT_JSON_SCHEMA,
+    DS4_TEXT_FORMAT_REGEX,
+    DS4_TEXT_FORMAT_LARK,
+    DS4_TEXT_FORMAT_LLGUIDANCE,
 } ds4_text_format_type;
 
 typedef struct {
@@ -423,9 +426,12 @@ static void ds4_text_format_clear(ds4_text_format *f) {
     memset(f, 0, sizeof(*f));
 }
 
-static bool ds4_text_format_is_json(const ds4_text_format *f) {
+static bool ds4_text_format_is_structured(const ds4_text_format *f) {
     return f && (f->type == DS4_TEXT_FORMAT_JSON_OBJECT ||
-                 f->type == DS4_TEXT_FORMAT_JSON_SCHEMA);
+                 f->type == DS4_TEXT_FORMAT_JSON_SCHEMA ||
+                 f->type == DS4_TEXT_FORMAT_REGEX ||
+                 f->type == DS4_TEXT_FORMAT_LARK ||
+                 f->type == DS4_TEXT_FORMAT_LLGUIDANCE);
 }
 
 static void ds4_text_format_set_schema(ds4_text_format *f,
@@ -440,12 +446,21 @@ static void ds4_text_format_set_schema(ds4_text_format *f,
     f->strict = strict;
 }
 
+static void ds4_text_format_set_constraint(ds4_text_format *f,
+                                           ds4_text_format_type type,
+                                           char *constraint_data) {
+    ds4_text_format_set_schema(f, type, NULL, constraint_data, false); /* constraint text goes in the schema slot; no name, not strict */
+}
+
 static const char *ds4_text_format_constraint_type(const ds4_text_format *f) {
     if (!f) return "text";
     if (f->type == DS4_TEXT_FORMAT_JSON_SCHEMA) return "json_schema";
     if (f->type == DS4_TEXT_FORMAT_JSON_OBJECT) {
         return f->schema_json ? "json_schema" : "json_object";
     }
+    if (f->type == DS4_TEXT_FORMAT_REGEX) return "regex";
+    if (f->type == DS4_TEXT_FORMAT_LARK) return "lark";
+    if (f->type == DS4_TEXT_FORMAT_LLGUIDANCE) return "llguidance";
     return "text";
 }
 
@@ -457,7 +472,7 @@ static bool ds4_text_format_validate_with_llguidance(ds4_engine *e,
                                                      const ds4_text_format *f,
                                                      char *err,
                                                      size_t errlen) {
-    if (!ds4_text_format_is_json(f)) return true;
+    if (!ds4_text_format_is_structured(f)) return true;
     if (!ds4_llguidance_available()) {
         snprintf(err, errlen,
                  "structured outputs require building ds4 with LLGUIDANCE=1");
@@ -472,7 +487,7 @@ static bool ds4_text_format_validate_with_llguidance(ds4_engine *e,
         llg_err,
         sizeof(llg_err));
     if (!g) {
-        snprintf(err, errlen, "invalid structured output schema: %s",
+        snprintf(err, errlen, "invalid structured output constraint: %s",
                  llg_err[0] ? llg_err : "llguidance rejected constraint");
         return false;
     }
@@ -556,6 +571,8 @@ static bool parse_chat_response_format(const char **p,
 
     char *type = NULL;
     char *schema = NULL;
+    char *regex = NULL;
+    char *grammar = NULL;
     char *name = NULL;
     bool strict = false;
     bool saw_json_schema = false;
@@ -587,6 +604,18 @@ static bool parse_chat_response_format(const char **p,
                 free(key);
                 goto bad;
             }
+        } else if (!strcmp(key, "regex")) {
+            free(regex);
+            if (!json_string(p, &regex)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "grammar")) {
+            free(grammar);
+            if (!json_string(p, &grammar)) {
+                free(key);
+                goto bad;
+            }
         } else if (!strcmp(key, "name")) {
             free(name);
             if (!json_string(p, &name)) {
@@ -632,6 +661,27 @@ static bool parse_chat_response_format(const char **p,
             snprintf(err, errlen, "response_format json_schema.schema is required");
             goto bad_keep_err;
         }
+    } else if (!strcmp(type, "regex")) {
+        if (!regex) {
+            snprintf(err, errlen, "response_format.regex is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_REGEX, regex);
+        regex = NULL;
+    } else if (!strcmp(type, "lark")) {
+        if (!grammar) {
+            snprintf(err, errlen, "response_format.grammar is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_LARK, grammar);
+        grammar = NULL;
+    } else if (!strcmp(type, "llguidance")) {
+        if (!grammar) {
+            snprintf(err, errlen, "response_format.grammar is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_LLGUIDANCE, grammar);
+        grammar = NULL;
     } else {
         snprintf(err, errlen, "response_format.type=%s not supported", type);
         goto bad_keep_err;
@@ -640,6 +690,8 @@ static bool parse_chat_response_format(const char **p,
     free(type);
     free(name);
     free(schema);
+    free(regex);
+    free(grammar);
     return true;
 bad:
     snprintf(err, errlen, "invalid response_format");
@@ -647,6 +699,8 @@ static bool parse_chat_response_format(const char **p,
     free(type);
     free(name);
     free(schema);
+    free(regex);
+    free(grammar);
     return false;
 }
 
@@ -664,6 +718,8 @@ static bool parse_responses_text_format_object(const char **p,
     char *type = NULL;
     char *name = NULL;
     char *schema = NULL;
+    char *regex = NULL;
+    char *grammar = NULL;
     bool strict = false;
     json_ws(p);
     while (**p && **p != '}') {
@@ -693,6 +749,18 @@ static bool parse_responses_text_format_object(const char **p,
                 free(key);
                 goto bad;
             }
+        } else if (!strcmp(key, "regex")) {
+            free(regex);
+            if (!json_string(p, &regex)) {
+                free(key);
+                goto bad;
+            }
+        } else if (!strcmp(key, "grammar")) {
+            free(grammar);
+            if (!json_string(p, &grammar)) {
+                free(key);
+                goto bad;
+            }
         } else if (!strcmp(key, "strict")) {
             if (!json_bool(p, &strict)) {
                 free(key);
@@ -731,6 +799,27 @@ static bool parse_responses_text_format_object(const char **p,
                                    name, schema, strict);
         name = NULL;
         schema = NULL;
+    } else if (!strcmp(type, "regex")) {
+        if (!regex) {
+            snprintf(err, errlen, "text.format.regex is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_REGEX, regex);
+        regex = NULL;
+    } else if (!strcmp(type, "lark")) {
+        if (!grammar) {
+            snprintf(err, errlen, "text.format.grammar is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_LARK, grammar);
+        grammar = NULL;
+    } else if (!strcmp(type, "llguidance")) {
+        if (!grammar) {
+            snprintf(err, errlen, "text.format.grammar is required");
+            goto bad_keep_err;
+        }
+        ds4_text_format_set_constraint(format, DS4_TEXT_FORMAT_LLGUIDANCE, grammar);
+        grammar = NULL;
     } else {
         snprintf(err, errlen, "text.format.type=%s not supported", type);
         goto bad_keep_err;
@@ -739,6 +828,8 @@ static bool parse_responses_text_format_object(const char **p,
     free(type);
     free(name);
     free(schema);
+    free(regex);
+    free(grammar);
     return true;
 bad:
     snprintf(err, errlen, "invalid text.format");
@@ -746,6 +837,8 @@ static bool parse_responses_text_format_object(const char **p,
     free(type);
     free(name);
     free(schema);
+    free(regex);
+    free(grammar);
     return false;
 }
 
@@ -3164,7 +3257,7 @@ static bool parse_chat_request(ds4_engine *e, server *s, const char *body, int d
         return false;
     }
     r->has_tools = tool_schemas && tool_schemas[0] && !tool_choice_none;
-    if (ds4_text_format_is_json(&r->text_format)) {
+    if (ds4_text_format_is_structured(&r->text_format)) {
         if (r->has_tools) {
             snprintf(err, errlen,
                      "structured outputs with tools are not supported");
@@ -4332,7 +4425,7 @@ static bool parse_responses_request(ds4_engine *e, server *s, const char *body,
         (!tool_choice_none && combined_tool_schemas.len) ?
         combined_tool_schemas.ptr : NULL;
     r->has_tools = active_tool_schemas && active_tool_schemas[0];
-    if (ds4_text_format_is_json(&r->text_format)) {
+    if (ds4_text_format_is_structured(&r->text_format)) {
         if (r->has_tools) {
             snprintf(err, errlen,
                      "structured outputs with tools are not supported");
@@ -6421,7 +6514,8 @@ static bool request_uses_structured_stream(const request *r) {
 }
 
 static bool request_uses_structured_decoder(const request *r) {
-    return r && r->kind == REQ_CHAT && ds4_text_format_is_json(&r->text_format);
+    return r && r->kind == REQ_CHAT &&
+        ds4_text_format_is_structured(&r->text_format);
 }
 
 /* Codex' Responses API uses 24-hex suffixes for response/item ids. Prefix
@@ -12305,6 +12399,50 @@ static void test_parse_chat_response_format_json_object(void) {
     ds4_text_format_clear(&fmt);
 }
 
+static void test_parse_chat_response_format_llguidance_extensions(void) {
+    const struct { /* one table row per response_format extension type */
+        const char *json; /* request fragment handed to the parser */
+        ds4_text_format_type type; /* expected fmt.type after parsing */
+        const char *constraint_type; /* expected ds4_text_format_constraint_type() label */
+        const char *needle; /* substring that must appear in fmt.schema_json */
+    } cases[] = {
+        {
+            "{\"type\":\"regex\",\"regex\":\"INV-[0-9]{4}\"}",
+            DS4_TEXT_FORMAT_REGEX,
+            "regex",
+            "INV-"
+        },
+        {
+            "{\"type\":\"lark\",\"grammar\":\"%llguidance {}\\nstart: /OK/\"}",
+            DS4_TEXT_FORMAT_LARK,
+            "lark",
+            "start:"
+        },
+        {
+            "{\"type\":\"llguidance\",\"grammar\":\"{\\\"grammars\\\":[]}\"}",
+            DS4_TEXT_FORMAT_LLGUIDANCE,
+            "llguidance",
+            "grammars"
+        },
+    };
+    /* Every row must parse, map to the expected type/label, and be fully consumed. */
+    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+        const char *p = cases[i].json;
+        ds4_text_format fmt = {0};
+        char err[160] = {0};
+        /* The constraint text is expected to land in fmt.schema_json. */
+        TEST_ASSERT(parse_chat_response_format(&p, &fmt, err, sizeof(err)));
+        TEST_ASSERT(fmt.type == cases[i].type);
+        TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt),
+                            cases[i].constraint_type));
+        TEST_ASSERT(fmt.schema_json && strstr(fmt.schema_json, cases[i].needle));
+        json_ws(&p);
+        TEST_ASSERT(*p == '\0');
+        /* Clear fmt before the next row. */
+        ds4_text_format_clear(&fmt);
+    }
+}
+
 static void test_parse_chat_response_format_rejects_missing_schema(void) {
     const char *json = "{\"type\":\"json_schema\",\"json_schema\":{\"name\":\"bad\"}}";
     const char *p = json;
@@ -12354,6 +12492,50 @@ static void test_parse_responses_text_format_json_object(void) {
     ds4_text_format_clear(&fmt);
 }
 
+static void test_parse_responses_text_format_llguidance_extensions(void) {
+    const struct { /* one table row per text.format extension type */
+        const char *json; /* "text" value (wrapping "format") handed to the parser */
+        ds4_text_format_type type; /* expected fmt.type after parsing */
+        const char *constraint_type; /* expected ds4_text_format_constraint_type() label */
+        const char *needle; /* substring that must appear in fmt.schema_json */
+    } cases[] = {
+        {
+            "{\"format\":{\"type\":\"regex\",\"regex\":\"INV-[0-9]{4}\"}}",
+            DS4_TEXT_FORMAT_REGEX,
+            "regex",
+            "INV-"
+        },
+        {
+            "{\"format\":{\"type\":\"lark\",\"grammar\":\"%llguidance {}\\nstart: /OK/\"}}",
+            DS4_TEXT_FORMAT_LARK,
+            "lark",
+            "start:"
+        },
+        {
+            "{\"format\":{\"type\":\"llguidance\",\"grammar\":\"{\\\"grammars\\\":[]}\"}}",
+            DS4_TEXT_FORMAT_LLGUIDANCE,
+            "llguidance",
+            "grammars"
+        },
+    };
+    /* Every row must parse, map to the expected type/label, and be fully consumed. */
+    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+        const char *p = cases[i].json;
+        ds4_text_format fmt = {0};
+        char err[160] = {0};
+        /* The constraint text is expected to land in fmt.schema_json. */
+        TEST_ASSERT(parse_responses_text_value(&p, &fmt, err, sizeof(err)));
+        TEST_ASSERT(fmt.type == cases[i].type);
+        TEST_ASSERT(!strcmp(ds4_text_format_constraint_type(&fmt),
+                            cases[i].constraint_type));
+        TEST_ASSERT(fmt.schema_json && strstr(fmt.schema_json, cases[i].needle));
+        json_ws(&p);
+        TEST_ASSERT(*p == '\0');
+        /* Clear fmt before the next row. */
+        ds4_text_format_clear(&fmt);
+    }
+}
+
 static void test_parse_responses_text_format_rejects_unknown_type(void) {
     const char *json = "{\"format\":{\"type\":\"xml\"}}";
     const char *p = json;
@@ -16168,9 +16350,11 @@ static void ds4_server_unit_tests_run(void) {
     test_render_chat_prompt_text_renders_tools_before_system();
     test_parse_chat_response_format_json_schema();
     test_parse_chat_response_format_json_object();
+    test_parse_chat_response_format_llguidance_extensions();
     test_parse_chat_response_format_rejects_missing_schema();
     test_parse_responses_text_format_json_schema();
     test_parse_responses_text_format_json_object();
+    test_parse_responses_text_format_llguidance_extensions();
     test_parse_responses_text_format_rejects_unknown_type();
     test_parse_responses_text_format_text_is_noop();
     test_tool_schema_order_from_anthropic_schema();
diff --git a/stress-test-cli.py b/stress-test-cli.py
new file mode 100755
index 000000000..af853496d
--- /dev/null
+++ b/stress-test-cli.py
@@ -0,0 +1,1086 @@
+#!/usr/bin/env python3
+"""Stress structured-output decoding across ds4 and llama.cpp servers.
+
+The OpenAI-compatible surfaces standardize JSON Schema structured outputs and
+JSON mode. llguidance itself supports a wider set of grammar tags: JSON Schema,
+JSON object, regex, Lark, and the internal guidance grammar-list wire format.
+
+This script keeps those layers explicit:
+
+* ds4 is exercised through /v1/chat/completions and /v1/responses with the
+  json_schema/json_object request shapes and ds4's llguidance extension types.
+* llama.cpp is exercised with the same OpenAI-compatible JSON cases and, for
+  broader llguidance grammar-family cases, with llama.cpp's top-level grammar
+  request extension.
+* Unsupported target/API/case combinations are reported as SKIP by default. Use
+  --strict-skips to make them fail the run, or --force-extensions to send
+  experimental non-OpenAI response_format types to targets that do not expose
+  them by default.
+
+Examples:
+  python3 stress-test-cli.py
+
+  python3 stress-test-cli.py --start never \
+      --ds4-base-url http://127.0.0.1:8000/v1 \
+      --llama-base-url http://127.0.0.1:8080/v1
+
+  python3 stress-test-cli.py --targets llama --families regex,lark,llguidance \
+      --llama-hf-model unsloth/Qwen3.5-9B-GGUF:Q8_0
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import json
+import math
+import os
+import re
+import shlex
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Callable
+
+
+DEFAULT_DS4_BASE_URL = "http://127.0.0.1:8000/v1"  # default API root for the ds4 target
+DEFAULT_LLAMA_BASE_URL = "http://127.0.0.1:8080/v1"  # default API root for the llama.cpp target
+DEFAULT_LLAMA_HF_MODEL = "unsloth/Qwen3.5-9B-GGUF:Q8_0"  # default model spec for llama.cpp (see --llama-hf-model)
+
+
+class ValidationError(Exception):
+    """Raised by validators when model output violates the expected constraint."""
+
+
+class UnsupportedCase(Exception):
+    """Raised when a case's constraint family cannot be sent to a target."""
+
+
+Validator = Callable[[str], str]  # raw model text -> normalized value; raises ValidationError on mismatch
+
+
+@dataclass(frozen=True)
+class Case:  # One immutable structured-output test case.
+    name: str  # case label; also sent as the json_schema "name"
+    family: str  # constraint family: json_schema, json_object, regex, lark, or llguidance
+    prompt: str
+    validator: Validator  # checks the model's text output for this case
+    schema: dict[str, Any] | None = None  # JSON Schema; required for json_schema cases
+    data: str = ""  # constraint text sent as "regex"/"grammar" in the extension formats
+    llama_grammar: str | None = None  # NOTE(review): presumably for llama.cpp's grammar extension; use site not visible here
+    oracle_sample: str | None = None  # NOTE(review): looks like a known-valid output; use site not visible here
+    max_tokens: int = 192
+
+
+@dataclass
+class Target:  # One server under test, plus bookkeeping for a locally started process.
+    name: str  # target label; "llama" selects llama-specific request shapes
+    base_url: str
+    model: str
+    command: list[str] | None
+    cwd: Path
+    supports_response_format_extensions: bool  # whether regex/lark/llguidance response_format types may be sent
+    supports_grammar_extension: bool  # NOTE(review): likely llama.cpp's top-level grammar field; confirm at use site
+    process: subprocess.Popen[str] | None = None
+    log_path: Path | None = None
+    started_by_us: bool = False
+
+
+@dataclass
+class Counts:  # Tally of case outcomes, all starting at zero.
+    passed: int = 0
+    failed: int = 0
+    skipped: int = 0
+
+
+def compact_json(value: Any) -> str:  # minimal-whitespace JSON; non-ASCII is kept verbatim
+    return json.dumps(value, separators=(",", ":"), ensure_ascii=False)
+
+
+def type_matches(value: Any, typ: str) -> bool:
+    # True when `value` has the JSON Schema primitive type named by `typ`.
+    is_bool = isinstance(value, bool)  # bool subclasses int, so numeric checks exclude it
+    table = (
+        ("object", isinstance(value, dict)),
+        ("array", isinstance(value, list)),
+        ("string", isinstance(value, str)),
+        ("integer", isinstance(value, int) and not is_bool),
+        ("number", isinstance(value, (int, float)) and not is_bool),
+        ("boolean", is_bool),
+        ("null", value is None),
+    )
+    for name, verdict in table:
+        if typ == name:
+            return verdict
+    return True  # unrecognized type names are not rejected
+
+
+def _validate_format(value: str, fmt: str, path: str) -> None:
+    # Enforce the "format" names handled below; any other name passes unchecked.
+    # Raises ValidationError (prefixed with `path`) when `value` does not conform.
+    if fmt in ("date", "time", "date-time"):
+        if fmt == "date":
+            parse, text = _dt.date.fromisoformat, value
+        elif fmt == "time":
+            parse, text = _dt.time.fromisoformat, value.replace("Z", "+00:00")
+        else:
+            parse, text = _dt.datetime.fromisoformat, value.replace("Z", "+00:00")
+        try:
+            parse(text)
+        except ValueError as exc:
+            raise ValidationError(f"{path}: expected RFC3339 {fmt}, got {value!r}") from exc
+        return
+    if fmt == "email":
+        shape = r"[^@\s]+@[^@\s]+\.[^@\s]+"
+    elif fmt == "uuid":
+        shape = (
+            r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-"
+            r"[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}"
+        )
+    else:
+        return
+    if re.fullmatch(shape, value) is None:
+        raise ValidationError(f"{path}: expected {fmt}, got {value!r}")
+
+
+def validate_schema(value: Any, schema: dict[str, Any], path: str = "$") -> None:
+    """Check `value` against a JSON Schema subset; raise ValidationError on the first violation.
+    allOf/anyOf/oneOf are applied together with their sibling keywords, not instead of them."""
+    if "allOf" in schema:
+        for option in schema["allOf"]:
+            validate_schema(value, option, path)
+
+    if "anyOf" in schema:
+        errors: list[str] = []
+        for option in schema["anyOf"]:
+            try:
+                validate_schema(value, option, path)
+                break  # one matching branch is enough; sibling keywords are still checked below
+            except ValidationError as exc:
+                errors.append(str(exc))
+        else:
+            raise ValidationError(f"{path}: did not match anyOf: {'; '.join(errors)}")
+
+    if "oneOf" in schema:
+        matches = 0
+        errors = []
+        for option in schema["oneOf"]:
+            try:
+                validate_schema(value, option, path)
+                matches += 1
+            except ValidationError as exc:
+                errors.append(str(exc))
+        if matches != 1:
+            raise ValidationError(f"{path}: expected exactly one oneOf match, got {matches}: {'; '.join(errors)}")
+
+    if "const" in schema and value != schema["const"]:
+        raise ValidationError(f"{path}: expected const {schema['const']!r}, got {value!r}")
+    if "enum" in schema and value not in schema["enum"]:
+        raise ValidationError(f"{path}: expected one of {schema['enum']!r}, got {value!r}")
+
+    typ = schema.get("type")
+    if isinstance(typ, list):
+        if not any(type_matches(value, t) for t in typ):
+            raise ValidationError(f"{path}: wrong type {type(value).__name__}, expected {typ!r}")
+    elif isinstance(typ, str) and not type_matches(value, typ):
+        raise ValidationError(f"{path}: wrong type {type(value).__name__}, expected {typ!r}")
+
+    if typ == "object" or "properties" in schema:
+        if not isinstance(value, dict):
+            raise ValidationError(f"{path}: expected object")
+        props = schema.get("properties", {})
+        for key in schema.get("required", []):
+            if key not in value:
+                raise ValidationError(f"{path}: missing required property {key!r}")
+        min_props, max_props = schema.get("minProperties"), schema.get("maxProperties")
+        if min_props is not None and len(value) < min_props:
+            raise ValidationError(f"{path}: expected at least {min_props} properties")
+        if max_props is not None and len(value) > max_props:
+            raise ValidationError(f"{path}: expected at most {max_props} properties")
+        if schema.get("additionalProperties") is False:
+            extra = sorted(set(value) - set(props))
+            if extra:
+                raise ValidationError(f"{path}: extra properties {extra!r}")
+        for key, sub in props.items():
+            if key in value:
+                validate_schema(value[key], sub, f"{path}.{key}")
+
+    if typ == "array" or "items" in schema or "prefixItems" in schema:
+        if not isinstance(value, list):
+            raise ValidationError(f"{path}: expected array")
+        min_items, max_items = schema.get("minItems"), schema.get("maxItems")
+        if min_items is not None and len(value) < min_items:
+            raise ValidationError(f"{path}: expected at least {min_items} items")
+        if max_items is not None and len(value) > max_items:
+            raise ValidationError(f"{path}: expected at most {max_items} items")
+        prefix_items = schema.get("prefixItems")
+        if isinstance(prefix_items, list):
+            for i, sub in enumerate(prefix_items):
+                if i < len(value):
+                    validate_schema(value[i], sub, f"{path}[{i}]")
+        items = schema.get("items")
+        if isinstance(items, dict):
+            start = len(prefix_items) if isinstance(prefix_items, list) else 0
+            for i, item in enumerate(value[start:], start):
+                validate_schema(item, items, f"{path}[{i}]")
+
+    if isinstance(value, str):
+        if "minLength" in schema and len(value) < schema["minLength"]:
+            raise ValidationError(f"{path}: string shorter than minLength {schema['minLength']}")
+        if "maxLength" in schema and len(value) > schema["maxLength"]:
+            raise ValidationError(f"{path}: string longer than maxLength {schema['maxLength']}")
+        if "pattern" in schema and re.fullmatch(schema["pattern"], value) is None:
+            raise ValidationError(f"{path}: {value!r} does not match {schema['pattern']!r}")
+        if "format" in schema:
+            _validate_format(value, schema["format"], path)
+
+    if isinstance(value, (int, float)) and not isinstance(value, bool):
+        if "minimum" in schema and value < schema["minimum"]:
+            raise ValidationError(f"{path}: {value!r} is below minimum {schema['minimum']!r}")
+        if "maximum" in schema and value > schema["maximum"]:
+            raise ValidationError(f"{path}: {value!r} is above maximum {schema['maximum']!r}")
+        if "exclusiveMinimum" in schema and value <= schema["exclusiveMinimum"]:
+            raise ValidationError(f"{path}: {value!r} is not above exclusiveMinimum {schema['exclusiveMinimum']!r}")
+        if "exclusiveMaximum" in schema and value >= schema["exclusiveMaximum"]:
+            raise ValidationError(f"{path}: {value!r} is not below exclusiveMaximum {schema['exclusiveMaximum']!r}")
+        if "multipleOf" in schema:
+            q = value / schema["multipleOf"]
+            if not math.isclose(q, round(q), rel_tol=0.0, abs_tol=1e-9):
+                raise ValidationError(f"{path}: {value!r} is not a multiple of {schema['multipleOf']!r}")
+
+
+def parse_json_strict(text: str) -> Any:  # decode JSON, ignoring surrounding whitespace
+    try:
+        parsed = json.loads(text.strip())
+    except json.JSONDecodeError as exc:
+        raise ValidationError(f"output is not JSON: {text!r}") from exc
+    return parsed
+
+
+def json_schema_validator(schema: dict[str, Any]) -> Validator:
+    # Build a validator: parse the text as JSON, check it against `schema`, return compact JSON.
+    def _validate(text: str) -> str:
+        parsed = parse_json_strict(text)
+        validate_schema(parsed, schema)
+        return compact_json(parsed)
+    return _validate
+
+
+def json_object_validator(text: str) -> str:  # accept only a top-level JSON object; return it compacted
+    parsed = parse_json_strict(text)
+    if isinstance(parsed, dict):
+        return compact_json(parsed)
+    raise ValidationError(f"output is not a JSON object: {parsed!r}")
+
+
+def regex_validator(pattern: str) -> Validator:
+    # Compile once; the returned validator requires a full match of the stripped text.
+    compiled = re.compile(pattern)
+    def _validate(text: str) -> str:
+        candidate = text.strip()
+        if compiled.fullmatch(candidate) is not None:
+            return candidate
+        raise ValidationError(f"{candidate!r} does not match /{pattern}/")
+
+    return _validate
+
+
+def choice_validator(choices: set[str]) -> Validator:
+    # The returned validator accepts the stripped text only if it is one of `choices`.
+    def _validate(text: str) -> str:
+        candidate = text.strip()
+        if candidate in choices:
+            return candidate
+        raise ValidationError(f"{candidate!r} is not one of {sorted(choices)!r}")
+    return _validate
+
+
+def permutation_validator(chars: str) -> Validator:
+    # The returned validator accepts any reordering of exactly the characters in `chars`.
+    wanted = sorted(chars)
+    def _validate(text: str) -> str:
+        candidate = text.strip()
+        same_chars = sorted(candidate) == wanted
+        if same_chars and len(candidate) == len(chars):
+            return candidate
+        raise ValidationError(f"{candidate!r} is not a permutation of {chars!r}")
+    return _validate
+
+
+def substring_chunk_validator(prefix: str, words: list[str]) -> Validator:
+    # Accept `prefix` followed by "" or any contiguous run of `words` joined by single spaces.
+    allowed = {""} | {
+        " ".join(words[lo:hi])
+        for lo in range(len(words))
+        for hi in range(lo + 1, len(words) + 1)
+    }
+    def _validate(text: str) -> str:
+        candidate = text.strip()
+        if not candidate.startswith(prefix):
+            raise ValidationError(f"{candidate!r} does not start with {prefix!r}")
+        rest = candidate[len(prefix):]
+        if rest not in allowed:
+            raise ValidationError(f"{rest!r} is not an allowed contiguous word substring")
+        return candidate
+    return _validate
+
+
+def make_cases() -> list[Case]:  # Build the case list: JSON Schema, JSON object, regex, Lark, and llguidance wire-format cases.
+    calendar_schema = {  # object with a name, an ISO date, and 1-5 participants
+        "type": "object",
+        "properties": {
+            "name": {"type": "string", "minLength": 1, "maxLength": 80},
+            "date": {"type": "string", "format": "date"},
+            "participants": {
+                "type": "array",
+                "items": {"type": "string", "minLength": 1},
+                "minItems": 1,
+                "maxItems": 5,
+            },
+        },
+        "required": ["name", "date", "participants"],
+        "additionalProperties": False,
+    }
+    status_schema = {  # exercises const, enum, bounded integer, and boolean
+        "type": "object",
+        "properties": {
+            "status": {"const": "ok"},
+            "priority": {"type": "string", "enum": ["low", "medium", "high"]},
+            "retry_count": {"type": "integer", "minimum": 0, "maximum": 5},
+            "active": {"type": "boolean"},
+        },
+        "required": ["status", "priority", "retry_count", "active"],
+        "additionalProperties": False,
+    }
+    ticket_schema = {  # exercises pattern, anyOf, oneOf, and the email format
+        "type": "object",
+        "properties": {
+            "id": {"type": "string", "pattern": "TCK-[0-9]{3}"},
+            "owner": {"anyOf": [{"type": "string", "minLength": 1}, {"type": "null"}]},
+            "priority": {"oneOf": [{"const": "low"}, {"const": "medium"}, {"const": "high"}]},
+            "contact": {"type": "string", "format": "email"},
+        },
+        "required": ["id", "owner", "priority", "contact"],
+        "additionalProperties": False,
+    }
+    reading_schema = {  # exercises a prefixItems tuple with numeric bounds and a pattern-constrained array
+        "type": "object",
+        "properties": {
+            "reading": {
+                "type": "array",
+                "prefixItems": [
+                    {"const": "temperature_c"},
+                    {"type": "number", "minimum": -40, "exclusiveMaximum": 80, "multipleOf": 0.5},
+                ],
+                "minItems": 2,
+                "maxItems": 2,
+            },
+            "tags": {
+                "type": "array",
+                "items": {"type": "string", "pattern": "[a-z]{3,8}"},
+                "minItems": 2,
+                "maxItems": 3,
+            },
+        },
+        "required": ["reading", "tags"],
+        "additionalProperties": False,
+    }
+
+    inline_json_schema = {  # embedded into inline_json_lark via %json
+        "type": "object",
+        "properties": {
+            "kind": {"const": "metric"},
+            "value": {"type": "integer", "minimum": 1, "maximum": 9},
+        },
+        "required": ["kind", "value"],
+        "additionalProperties": False,
+    }
+    inline_json_lark = f"""%llguidance {{}}
+start: %json {compact_json(inline_json_schema)}
+"""
+    choice_lark = """%llguidance {}
+start: "red" | "green" | "blue"
+"""
+    regex_lark = """%llguidance {}
+start: /INV-[0-9]{4}/
+"""
+    regex_ext_lark = """%llguidance {}
+start: "chunk:" %regex { "substring_words": "alpha beta gamma delta" }
+"""
+    parametric_lark = """%llguidance {}
+start: perm::0x0
+perm::_: ""                   %if is_ones([0:3])
+       | "a" perm::set_bit(0) %if bit_clear(0)
+       | "b" perm::set_bit(1) %if bit_clear(1)
+       | "c" perm::set_bit(2) %if bit_clear(2)
+"""
+    guidance_lark = """%llguidance {}
+start: "YES" | "NO"
+"""
+    guidance_wire = compact_json({"grammars": [{"lark_grammar": guidance_lark}]})  # guidance_lark wrapped in the grammar-list wire format
+
+    return [
+        Case(
+            name="json_schema_calendar",
+            family="json_schema",
+            prompt="Return one lunch calendar event for Alice and Bob on 2026-06-01. Return only JSON.",
+            schema=calendar_schema,
+            validator=json_schema_validator(calendar_schema),
+            data=compact_json(calendar_schema),
+            oracle_sample='{"name":"Lunch","date":"2026-06-01","participants":["Alice","Bob"]}',
+        ),
+        Case(
+            name="json_schema_status",
+            family="json_schema",
+            prompt="Return a compact health-check object with status ok. Return only JSON.",
+            schema=status_schema,
+            validator=json_schema_validator(status_schema),
+            data=compact_json(status_schema),
+            oracle_sample='{"status":"ok","priority":"medium","retry_count":2,"active":true}',
+        ),
+        Case(
+            name="json_schema_anyof_oneof_format",
+            family="json_schema",
+            prompt=(
+                "Return one support ticket with an id like TCK-123, an owner or null, "
+                "one priority, and a contact email. Return only JSON."
+            ),
+            schema=ticket_schema,
+            validator=json_schema_validator(ticket_schema),
+            data=compact_json(ticket_schema),
+            oracle_sample='{"id":"TCK-123","owner":null,"priority":"high","contact":"ops@example.com"}',
+        ),
+        Case(
+            name="json_schema_tuple_numeric",
+            family="json_schema",
+            prompt="Return one sensor reading tuple and two short lowercase tags. Return only JSON.",
+            schema=reading_schema,
+            validator=json_schema_validator(reading_schema),
+            data=compact_json(reading_schema),
+            oracle_sample='{"reading":["temperature_c",21.5],"tags":["lab","green"]}',
+        ),
+        Case(
+            name="json_object_mode",
+            family="json_object",
+            prompt="Return a JSON object with a tiny task description and whether it is done.",
+            validator=json_object_validator,
+            data="",
+            oracle_sample='{"task":"check","done":false}',
+        ),
+        Case(
+            name="regex_invoice_id",
+            family="regex",
+            prompt="Return exactly one invoice id in the form INV-0427. No quotes, no prose.",
+            validator=regex_validator(r"INV-[0-9]{4}"),
+            data=r"INV-[0-9]{4}",
+            llama_grammar=regex_lark,
+            oracle_sample="INV-0427",
+            max_tokens=32,
+        ),
+        Case(
+            name="lark_choice",
+            family="lark",
+            prompt="Return exactly one lowercase color: red, green, or blue. No quotes, no prose.",
+            validator=choice_validator({"red", "green", "blue"}),
+            data=choice_lark,
+            llama_grammar=choice_lark,
+            oracle_sample="green",
+            max_tokens=16,
+        ),
+        Case(
+            name="lark_inline_json",
+            family="lark",
+            prompt='Return a compact JSON object with kind "metric" and a small integer value.',
+            validator=json_schema_validator(inline_json_schema),
+            data=inline_json_lark,
+            llama_grammar=inline_json_lark,
+            oracle_sample='{"kind":"metric","value":7}',
+            max_tokens=64,
+        ),
+        Case(
+            name="lark_regex_ext_substring",
+            family="lark",
+            prompt="Return exactly chunk:beta gamma. No quotes, no prose.",
+            validator=substring_chunk_validator("chunk:", ["alpha", "beta", "gamma", "delta"]),
+            data=regex_ext_lark,
+            llama_grammar=regex_ext_lark,
+            oracle_sample="chunk:beta gamma",
+            max_tokens=32,
+        ),
+        Case(
+            name="lark_parametric_permutation",
+            family="lark",
+            prompt="Return exactly one permutation of the letters a, b, and c. No separators, no prose.",
+            validator=permutation_validator("abc"),
+            data=parametric_lark,
+            llama_grammar=parametric_lark,
+            oracle_sample="cab",
+            max_tokens=16,
+        ),
+        Case(
+            name="llguidance_internal_wire",
+            family="llguidance",
+            prompt="Return exactly YES or NO in uppercase. No punctuation, no prose.",
+            validator=choice_validator({"YES", "NO"}),
+            data=guidance_wire,
+            llama_grammar=guidance_lark,  # the plain Lark form of the same YES/NO grammar
+            oracle_sample="YES",
+            max_tokens=16,
+        ),
+    ]
+
+
+def post_json(url: str, payload: dict[str, Any], timeout: float, api_key: str | None = None) -> dict[str, Any]:
+    request_headers = {"Content-Type": "application/json"}
+    if api_key:
+        request_headers["Authorization"] = f"Bearer {api_key}"
+    encoded = json.dumps(payload, separators=(",", ":")).encode("utf-8")
+    request = urllib.request.Request(url, data=encoded, headers=request_headers, method="POST")
+    try:  # transport: HTTP and URL failures are re-raised as RuntimeError
+        with urllib.request.urlopen(request, timeout=timeout) as reply:
+            text = reply.read().decode("utf-8", errors="replace")
+    except urllib.error.HTTPError as exc:
+        text = exc.read().decode("utf-8", errors="replace")
+        raise RuntimeError(f"HTTP {exc.code}: {text[:1600]}") from exc
+    except urllib.error.URLError as exc:
+        raise RuntimeError(str(exc)) from exc
+    try:  # decoding: the reply body must be JSON
+        decoded = json.loads(text)
+    except json.JSONDecodeError as exc:
+        raise RuntimeError(f"invalid JSON response: {text[:1600]}") from exc
+    if isinstance(decoded, dict) and decoded.get("error"):  # API-level error object
+        raise RuntimeError(f"API error: {decoded['error']!r}")
+    return decoded
+
+
+def get_status(url: str, timeout: float) -> tuple[int | None, str]:  # (status, body prefix), or (None, error text)
+    req = urllib.request.Request(url, method="GET")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return resp.status, resp.read(512).decode("utf-8", errors="replace")
+    except urllib.error.HTTPError as exc:
+        return exc.code, exc.read(512).decode("utf-8", errors="replace")
+    except OSError as exc:  # URLError subclasses OSError; also catches timeouts/resets raised while reading the reply
+        return None, str(exc)
+
+
+def extract_chat_text(body: dict[str, Any]) -> str:  # first choice's text content, else RuntimeError
+    choices = body.get("choices")
+    if not isinstance(choices, list) or not choices:
+        raise RuntimeError(f"missing choices in chat response: {body!r}")
+    message = choices[0].get("message") if isinstance(choices[0], dict) else None
+    content = message.get("content") if isinstance(message, dict) else None  # null/odd shapes reach the raise below
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        parts: list[str] = []
+        for part in content:
+            if isinstance(part, dict):
+                if isinstance(part.get("text"), str):
+                    parts.append(part["text"])
+                elif isinstance(part.get("content"), str):
+                    parts.append(part["content"])
+        if parts:
+            return "".join(parts)
+    raise RuntimeError(f"missing text content in chat response: {body!r}")
+
+
+def extract_responses_text(body: dict[str, Any]) -> str:
+    # Prefer a string "output_text"; otherwise join the text parts of all "message" output items.
+    if isinstance(body.get("output_text"), str):
+        return body["output_text"]
+    chunks = [
+        part["text"]
+        for item in body.get("output", [])
+        if isinstance(item, dict) and item.get("type") == "message"
+        for part in item.get("content", [])
+        if isinstance(part, dict) and isinstance(part.get("text"), str)
+    ]
+    if chunks:
+        return "".join(chunks)
+    raise RuntimeError(f"missing output text in responses response: {body!r}")
+
+
+def response_format_for_case(case: Case, target: Target, api: str, args: argparse.Namespace) -> dict[str, Any]:
+    if case.family == "json_object":
+        fmt: dict[str, Any] = {"type": "json_object"}
+        if args.json_object_schema:
+            fmt["schema"] = {"type": "object"}
+        return fmt
+
+    if case.family == "json_schema":
+        if not case.schema:
+            raise RuntimeError(f"{case.name}: missing schema")
+        if api == "chat":
+            if target.name == "llama" and args.llama_chat_schema_style == "flat":
+                return {
+                    "type": "json_schema",
+                    "name": case.name,
+                    "strict": True,
+                    "schema": case.schema,
+                }
+            return {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": case.name,
+                    "strict": True,
+                    "schema": case.schema,
+                },
+            }
+        return {
+            "type": "json_schema",
+            "name": case.name,
+            "strict": True,
+            "schema": case.schema,
+        }
+
+    if not args.force_extensions and not target.supports_response_format_extensions:
+        raise UnsupportedCase(f"{target.name} does not expose {case.family!r} through OpenAI response_format")
+
+    if case.family == "regex":
+        return {"type": "regex", "regex": case.data}
+    if case.family == "lark":
+        return {"type": "lark", "grammar": case.data}
+    if case.family == "llguidance":
+        return {"type": "llguidance", "grammar": case.data}
+    raise UnsupportedCase(f"unknown structured-output family {case.family!r}")
+
+
+def add_llama_common_payload_fields(payload: dict[str, Any], target: Target, args: argparse.Namespace) -> None:
+    if target.name != "llama":
+        return
+    if args.llama_disable_thinking:
+        payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
+    if args.seed is not None:
+        payload["seed"] = args.seed
+
+
+def chat_payload(target: Target, case: Case, args: argparse.Namespace) -> dict[str, Any]:
+    payload: dict[str, Any] = {
+        "model": target.model,
+        "messages": [{"role": "user", "content": case.prompt}],
+        "max_tokens": case.max_tokens,
+        "temperature": 0,
+    }
+    if (case.family in {"json_schema", "json_object"} or
+            args.force_extensions or
+            target.supports_response_format_extensions):
+        payload["response_format"] = response_format_for_case(case, target, "chat", args)
+    elif target.supports_grammar_extension and case.llama_grammar:
+        payload["grammar"] = case.llama_grammar
+    else:
+        raise UnsupportedCase(f"{target.name}/chat cannot carry {case.family!r}")
+    if args.seed is not None:
+        payload["seed"] = args.seed
+    add_llama_common_payload_fields(payload, target, args)
+    return payload
+
+
+def responses_payload(target: Target, case: Case, args: argparse.Namespace) -> dict[str, Any]:
+    payload: dict[str, Any] = {
+        "model": target.model,
+        "input": case.prompt,
+        "max_output_tokens": case.max_tokens,
+        "temperature": 0,
+    }
+    if (case.family in {"json_schema", "json_object"} or
+            args.force_extensions or
+            target.supports_response_format_extensions):
+        payload["text"] = {"format": response_format_for_case(case, target, "responses", args)}
+    elif target.supports_grammar_extension and case.llama_grammar:
+        payload["grammar"] = case.llama_grammar
+    else:
+        raise UnsupportedCase(f"{target.name}/responses cannot carry {case.family!r}")
+    if args.seed is not None:
+        payload["seed"] = args.seed
+    add_llama_common_payload_fields(payload, target, args)
+    return payload
+
+
+def check_case(target: Target, api: str, case: Case, args: argparse.Namespace) -> str:
+    if api == "chat":
+        payload = chat_payload(target, case, args)
+        body = post_json(f"{target.base_url}/chat/completions", payload, args.timeout, args.api_key)
+        text = extract_chat_text(body)
+    elif api == "responses":
+        payload = responses_payload(target, case, args)
+        body = post_json(f"{target.base_url}/responses", payload, args.timeout, args.api_key)
+        text = extract_responses_text(body)
+    else:
+        raise RuntimeError(f"unknown api {api!r}")
+    return case.validator(text)
+
+
+def split_csv(value: str) -> list[str]:
+    return [x.strip() for x in value.split(",") if x.strip()]
+
+
+def flatten_extra_args(values: list[str] | None) -> list[str]:
+    out: list[str] = []
+    for value in values or []:
+        out.extend(shlex.split(value))
+    return out
+
+
+def base_root(base_url: str) -> str:  # base URL without its "/v1" API suffix
+    if base_url.endswith("/v1"):
+        return base_url[:-3]  # drop exactly the three characters of "/v1"
+    return base_url.rstrip("/")  # no "/v1" suffix: only trim trailing slashes
+
+
+def port_from_url(base_url: str, default: int) -> int:  # port to substitute into a launch command
+    parsed = urllib.parse.urlparse(base_url)
+    if parsed.port:  # an explicit, non-zero port in the URL takes precedence
+        return parsed.port
+    if parsed.scheme == "https":
+        return 443
+    if parsed.scheme == "http":
+        return 80
+    return default  # reached only for schemes other than http/https
+
+
+def target_is_ready(target: Target, timeout: float = 2.0) -> bool:
+    for url in (f"{base_root(target.base_url)}/health", f"{target.base_url}/models"):
+        status, _body = get_status(url, timeout)
+        if status == 200:
+            return True
+    return False
+
+
+def wait_ready(target: Target, startup_timeout: float) -> None:  # poll until a probe returns 200, else raise RuntimeError
+    deadline = time.monotonic() + startup_timeout  # monotonic clock: unaffected by wall-clock adjustments
+    last = ""
+    while time.monotonic() < deadline:
+        if target.process and target.process.poll() is not None:  # the server we launched has already exited
+            log_hint = f" log={target.log_path}" if target.log_path else ""
+            raise RuntimeError(f"{target.name} server exited with code {target.process.returncode}.{log_hint}")
+        for url in (f"{base_root(target.base_url)}/health", f"{target.base_url}/models"):
+            status, body = get_status(url, 2.0)
+            last = f"{url}: {status} {body[:200]}"  # remembered for the timeout message
+            if status == 200:
+                return
+        time.sleep(1.0)
+    log_hint = f" log={target.log_path}" if target.log_path else ""
+    raise RuntimeError(f"{target.name} did not become ready within {startup_timeout:.0f}s ({last}).{log_hint}")
+
+
+def start_target_if_needed(target: Target, args: argparse.Namespace) -> None:  # launch per --start policy, then wait
+    already_ready = target_is_ready(target)
+    if args.start == "never":
+        if not already_ready:
+            raise RuntimeError(f"{target.name} is not reachable at {target.base_url} and --start=never was set")
+        return
+    if already_ready and args.start != "always":  # reuse a server that is already answering
+        return
+    if not target.command:
+        raise RuntimeError(f"{target.name} has no launch command")
+
+    with tempfile.NamedTemporaryFile(  # the child gets its own copy of the descriptor; ours is closed on exit
+        prefix=f"stress-{target.name}-",
+        suffix=".log",
+        mode="w",
+        encoding="utf-8",
+        delete=False,  # closing must not remove the log; wait_ready reports its path on failure
+    ) as log_file:
+        target.log_path = Path(log_file.name)
+        target.process = subprocess.Popen(
+            target.command,
+            cwd=str(target.cwd),
+            stdout=log_file,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+    target.started_by_us = True
+    wait_ready(target, args.startup_timeout)
+
+
+def stop_target(target: Target) -> None:  # stop the recorded server process, if it is still running
+    if not target.process or target.process.poll() is not None:  # nothing launched, or already exited
+        return
+    target.process.terminate()
+    try:
+        target.process.wait(timeout=15)
+    except subprocess.TimeoutExpired:  # did not exit within 15s of terminate(): escalate to kill()
+        target.process.kill()
+        target.process.wait(timeout=15)
+
+
+def build_ds4_command(args: argparse.Namespace) -> list[str]:
+    if args.ds4_cmd:
+        return shlex.split(args.ds4_cmd.format(port=port_from_url(args.ds4_base_url, 8000)))
+    return [
+        args.ds4_binary,
+        "--model",
+        args.ds4_model_path,
+        "--ctx",
+        str(args.server_ctx),
+        "--tokens",
+        str(args.server_default_tokens),
+        "--port",
+        str(port_from_url(args.ds4_base_url, 8000)),
+        *flatten_extra_args(args.ds4_extra_arg),
+    ]
+
+
+def build_llama_command(args: argparse.Namespace) -> list[str]:
+    if args.llama_cmd:
+        return shlex.split(args.llama_cmd.format(port=port_from_url(args.llama_base_url, 8080)))
+    cmd = [
+        args.llama_binary,
+        "-hf",
+        args.llama_hf_model,
+        "-c",
+        str(args.server_ctx),
+        "--port",
+        str(port_from_url(args.llama_base_url, 8080)),
+        "--jinja",
+    ]
+    if args.llama_ngl is not None:
+        cmd.extend(["-ngl", str(args.llama_ngl)])
+    cmd.extend(flatten_extra_args(args.llama_extra_arg))
+    return cmd
+
+
+def selected_cases(args: argparse.Namespace) -> list[Case]:
+    cases = make_cases()
+    families = set(split_csv(args.families)) if args.families != "all" else set()
+    names = set(args.case or [])
+    out = [
+        c for c in cases
+        if (not families or c.family in families) and (not names or c.name in names)
+    ]
+    missing = names - {c.name for c in cases}
+    if missing:
+        raise SystemExit(f"unknown case(s): {', '.join(sorted(missing))}")
+    known_families = {c.family for c in cases}
+    unknown_families = families - known_families
+    if unknown_families:
+        raise SystemExit(f"unknown family/families: {', '.join(sorted(unknown_families))}")
+    return out
+
+
+def run_llguidance_oracle(cases: list[Case], args: argparse.Namespace) -> Counts:  # local grammar check, no server
+    counts = Counts()
+    if args.oracle == "never":
+        return counts
+    try:
+        import llguidance  # type: ignore
+    except ModuleNotFoundError:
+        msg = "SKIP oracle/llguidance: python package is not importable"
+        if args.oracle == "require":
+            print(msg, file=sys.stderr)
+            counts.failed += 1
+        elif args.verbose:
+            print(msg)
+            counts.skipped += 1
+        return counts
+
+    try:
+        tok = llguidance.LLTokenizer("byte")
+    except Exception as exc:
+        msg = f"SKIP oracle/llguidance: failed to create byte tokenizer: {exc}"
+        if args.oracle == "require":
+            print(msg, file=sys.stderr)
+            counts.failed += 1
+        elif args.verbose:
+            print(msg)
+            counts.skipped += 1
+        return counts
+
+    for case in cases:
+        label = f"oracle/{case.family}/{case.name}"
+        try:  # grammar construction is inside the try so a bad grammar is a per-case FAIL, not a crash
+            if case.family == "json_schema":
+                grammar = llguidance.LLMatcher.grammar_from_json_schema(case.schema)
+            elif case.family == "json_object":
+                grammar = llguidance.LLMatcher.grammar_from_json_schema({"type": "object"})
+            elif case.family == "regex":
+                grammar = llguidance.LLMatcher.grammar_from_regex(case.data)
+            elif case.family == "lark":
+                grammar = llguidance.LLMatcher.grammar_from_lark(case.data)
+            elif case.family == "llguidance":
+                grammar = llguidance.grammar_from("llguidance", case.data)
+            else:  # family without an oracle grammar: counted as skipped, nothing printed
+                counts.skipped += 1
+                continue
+
+            err = llguidance.LLMatcher.validate_grammar(grammar, tok)
+            if err:
+                raise RuntimeError(err)
+            if case.oracle_sample is not None:  # replay the known-good sample one token at a time
+                matcher = llguidance.LLMatcher(tok, grammar)
+                for token in tok.tokenize_str(case.oracle_sample):
+                    bias = matcher.compute_logit_bias()
+                    if token >= len(bias) or bias[token] != 200:  # NOTE(review): 200 presumably marks an allowed token - confirm
+                        raise RuntimeError(f"sample token {token} is not allowed")
+                    if not matcher.consume_token(token):
+                        raise RuntimeError(f"sample token {token} was rejected")
+                if not matcher.is_accepting():
+                    raise RuntimeError("sample did not leave matcher in accepting state")
+            print(f"PASS {label}")
+            counts.passed += 1
+        except Exception as exc:
+            print(f"FAIL {label}: {exc}", file=sys.stderr)
+            counts.failed += 1
+            if args.fail_fast:
+                raise
+    return counts
+
+
+def run_target(target: Target, cases: list[Case], args: argparse.Namespace) -> Counts:
+    counts = Counts()
+    start_target_if_needed(target, args)
+    apis = split_csv(args.apis)
+    for repeat in range(args.repeat):
+        for api in apis:
+            for case in cases:
+                label = f"{target.name}/{api}/{case.family}/{case.name}"
+                if args.repeat > 1:
+                    label = f"{label}#{repeat + 1}"
+                t0 = time.time()
+                try:
+                    value = check_case(target, api, case, args)
+                    elapsed = time.time() - t0
+                    if args.verbose:
+                        print(f"PASS {label} {elapsed:.2f}s {value}")
+                    else:
+                        print(f"PASS {label} {elapsed:.2f}s")
+                    counts.passed += 1
+                except UnsupportedCase as exc:
+                    elapsed = time.time() - t0
+                    msg = f"SKIP {label} {elapsed:.2f}s: {exc}"
+                    if args.strict_skips:
+                        print(msg, file=sys.stderr)
+                        counts.failed += 1
+                        if args.fail_fast:
+                            raise
+                    else:
+                        print(msg)
+                        counts.skipped += 1
+                except Exception as exc:
+                    counts.failed += 1
+                    print(f"FAIL {label}: {exc}", file=sys.stderr)
+                    if args.fail_fast:
+                        raise
+    return counts
+
+
+def parse_args() -> argparse.Namespace:
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--targets", default="ds4,llama", help="Comma-separated targets: ds4,llama")
+    p.add_argument("--apis", default="chat,responses", help="Comma-separated APIs: chat,responses")
+    p.add_argument("--families", default="all", help="Comma-separated families or all")
+    p.add_argument("--case", action="append", help="Run only this case name; may repeat")
+    p.add_argument("--list-cases", action="store_true", help="Print selected cases and exit")
+    p.add_argument("--repeat", type=int, default=1)
+    p.add_argument("--timeout", type=float, default=180.0)
+    p.add_argument("--startup-timeout", type=float, default=900.0)
+    p.add_argument("--start", choices=["missing", "never", "always"], default="missing")
+    p.add_argument("--stop-started", action=argparse.BooleanOptionalAction, default=True)
+    p.add_argument("--strict-skips", action="store_true", help="Treat unsupported matrix entries as failures")
+    p.add_argument("--force-extensions", action="store_true", help="Send regex/lark/llguidance as experimental response_format types")
+    p.add_argument("--fail-fast", action="store_true")
+    p.add_argument("--verbose", action="store_true")
+    p.add_argument("--api-key", help="Optional bearer token for non-local OpenAI-compatible servers")
+    p.add_argument("--seed", type=int, default=1)
+    p.add_argument("--json-object-schema", action="store_true", help="Attach {'type':'object'} to json_object mode")
+    p.add_argument("--oracle", choices=["auto", "never", "require"], default="auto", help="Local Python llguidance grammar validation")
+
+    p.add_argument("--server-ctx", type=int, default=8192)
+    p.add_argument("--server-default-tokens", type=int, default=384)
+
+    p.add_argument("--ds4-base-url", default=DEFAULT_DS4_BASE_URL)
+    p.add_argument("--ds4-model", default="ds4")
+    p.add_argument("--ds4-binary", default="./ds4-server")
+    p.add_argument("--ds4-model-path", default="ds4flash.gguf")
+    p.add_argument("--ds4-cmd", help="Override ds4 launch command; {port} is expanded")
+    p.add_argument("--ds4-extra-arg", action="append", help="Extra ds4-server args; may repeat")
+
+    p.add_argument("--llama-base-url", default=DEFAULT_LLAMA_BASE_URL)
+    p.add_argument("--llama-model", default=DEFAULT_LLAMA_HF_MODEL)
+    p.add_argument("--llama-binary", default="llama-server")
+    p.add_argument("--llama-hf-model", default=DEFAULT_LLAMA_HF_MODEL)
+    p.add_argument("--llama-cmd", help="Override llama launch command; {port} is expanded")
+    p.add_argument("--llama-extra-arg", action="append", help="Extra llama-server args; may repeat")
+    p.add_argument("--llama-ngl", type=int, default=999, help="llama.cpp GPU layers; set -1 to omit")
+    p.add_argument(
+        "--llama-chat-schema-style",
+        choices=["openai", "flat"],
+        default="flat",
+        help="json_schema shape for llama.cpp chat response_format",
+    )
+    p.add_argument("--llama-disable-thinking", action=argparse.BooleanOptionalAction, default=True)
+    return p.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    if args.llama_ngl is not None and args.llama_ngl < 0:
+        args.llama_ngl = None
+
+    repo = Path(__file__).resolve().parent
+    targets_requested = split_csv(args.targets)
+    unknown_targets = set(targets_requested) - {"ds4", "llama"}
+    if unknown_targets:
+        raise SystemExit(f"unknown target(s): {', '.join(sorted(unknown_targets))}")
+
+    cases = selected_cases(args)
+    if args.list_cases:
+        for case in cases:
+            print(f"{case.family}\t{case.name}")
+        return 0
+
+    total = Counts()
+
+    oracle_counts = run_llguidance_oracle(cases, args)
+    total.passed += oracle_counts.passed
+    total.failed += oracle_counts.failed
+    total.skipped += oracle_counts.skipped
+
+    target_map: dict[str, Target] = {
+        "ds4": Target(
+            name="ds4",
+            base_url=args.ds4_base_url.rstrip("/"),
+            model=args.ds4_model,
+            command=build_ds4_command(args),
+            cwd=repo,
+            supports_response_format_extensions=True,
+            supports_grammar_extension=False,
+        ),
+        "llama": Target(
+            name="llama",
+            base_url=args.llama_base_url.rstrip("/"),
+            model=args.llama_model,
+            command=build_llama_command(args),
+            cwd=repo,
+            supports_response_format_extensions=False,
+            supports_grammar_extension=True,
+        ),
+    }
+
+    try:
+        for name in targets_requested:
+            target = target_map[name]
+            counts = run_target(target, cases, args)
+            total.passed += counts.passed
+            total.failed += counts.failed
+            total.skipped += counts.skipped
+            if args.stop_started and target.started_by_us:
+                stop_target(target)
+    finally:
+        for target in target_map.values():
+            if args.stop_started and target.started_by_us:
+                stop_target(target)
+
+    print(f"SUMMARY pass={total.passed} fail={total.failed} skip={total.skipped}")
+    return 1 if total.failed else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From ce6524980cba8d4f8245db007f83732628309be8 Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Sat, 30 May 2026 15:39:36 +0100
Subject: [PATCH 6/9] Remove stress test script from repo

---
 stress-test-cli.py | 1086 --------------------------------------------
 1 file changed, 1086 deletions(-)
 delete mode 100755 stress-test-cli.py

diff --git a/stress-test-cli.py b/stress-test-cli.py
deleted file mode 100755
index af853496d..000000000
--- a/stress-test-cli.py
+++ /dev/null
@@ -1,1086 +0,0 @@
-#!/usr/bin/env python3
-"""Stress structured-output decoding across ds4 and llama.cpp servers.
-
-The OpenAI-compatible surfaces standardize JSON Schema structured outputs and
-JSON mode. llguidance itself supports a wider set of grammar tags: JSON Schema,
-JSON object, regex, Lark, and the internal guidance grammar-list wire format.
-
-This script keeps those layers explicit:
-
-* ds4 is exercised through /v1/chat/completions and /v1/responses with the
-  json_schema/json_object request shapes and ds4's llguidance extension types.
-* llama.cpp is exercised with the same OpenAI-compatible JSON cases and, for
-  broader llguidance grammar-family cases, with llama.cpp's top-level grammar
-  request extension.
-* Unsupported target/API/case combinations are reported as SKIP by default. Use
-  --strict-skips to make them fail the run, or --force-extensions to send
-  experimental non-OpenAI response_format types to targets that do not expose
-  them by default.
-
-Examples:
-  python3 stress-test-cli.py
-
-  python3 stress-test-cli.py --start never \
-      --ds4-base-url http://127.0.0.1:8000/v1 \
-      --llama-base-url http://127.0.0.1:8080/v1
-
-  python3 stress-test-cli.py --targets llama --families regex,lark,llguidance \
-      --llama-hf-model unsloth/Qwen3.5-9B-GGUF:Q8_0
-"""
-
-from __future__ import annotations
-
-import argparse
-import datetime as _dt
-import json
-import math
-import os
-import re
-import shlex
-import subprocess
-import sys
-import tempfile
-import time
-import urllib.error
-import urllib.parse
-import urllib.request
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Callable
-
-
-DEFAULT_DS4_BASE_URL = "http://127.0.0.1:8000/v1"
-DEFAULT_LLAMA_BASE_URL = "http://127.0.0.1:8080/v1"
-DEFAULT_LLAMA_HF_MODEL = "unsloth/Qwen3.5-9B-GGUF:Q8_0"
-
-
-class ValidationError(Exception):
-    pass
-
-
-class UnsupportedCase(Exception):
-    pass
-
-
-Validator = Callable[[str], str]
-
-
-@dataclass(frozen=True)
-class Case:
-    name: str
-    family: str
-    prompt: str
-    validator: Validator
-    schema: dict[str, Any] | None = None
-    data: str = ""
-    llama_grammar: str | None = None
-    oracle_sample: str | None = None
-    max_tokens: int = 192
-
-
-@dataclass
-class Target:
-    name: str
-    base_url: str
-    model: str
-    command: list[str] | None
-    cwd: Path
-    supports_response_format_extensions: bool
-    supports_grammar_extension: bool
-    process: subprocess.Popen[str] | None = None
-    log_path: Path | None = None
-    started_by_us: bool = False
-
-
-@dataclass
-class Counts:
-    passed: int = 0
-    failed: int = 0
-    skipped: int = 0
-
-
-def compact_json(value: Any) -> str:
-    return json.dumps(value, ensure_ascii=False, separators=(",", ":"))
-
-
-def type_matches(value: Any, typ: str) -> bool:
-    if typ == "object":
-        return isinstance(value, dict)
-    if typ == "array":
-        return isinstance(value, list)
-    if typ == "string":
-        return isinstance(value, str)
-    if typ == "integer":
-        return isinstance(value, int) and not isinstance(value, bool)
-    if typ == "number":
-        return (isinstance(value, int) or isinstance(value, float)) and not isinstance(value, bool)
-    if typ == "boolean":
-        return isinstance(value, bool)
-    if typ == "null":
-        return value is None
-    return True
-
-
-def _validate_format(value: str, fmt: str, path: str) -> None:
-    if fmt == "date":
-        try:
-            _dt.date.fromisoformat(value)
-        except ValueError as exc:
-            raise ValidationError(f"{path}: expected RFC3339 date, got {value!r}") from exc
-    elif fmt == "time":
-        try:
-            _dt.time.fromisoformat(value.replace("Z", "+00:00"))
-        except ValueError as exc:
-            raise ValidationError(f"{path}: expected RFC3339 time, got {value!r}") from exc
-    elif fmt == "date-time":
-        try:
-            _dt.datetime.fromisoformat(value.replace("Z", "+00:00"))
-        except ValueError as exc:
-            raise ValidationError(f"{path}: expected RFC3339 date-time, got {value!r}") from exc
-    elif fmt == "email":
-        if re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", value) is None:
-            raise ValidationError(f"{path}: expected email, got {value!r}")
-    elif fmt == "uuid":
-        if re.fullmatch(
-            r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-"
-            r"[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}",
-            value,
-        ) is None:
-            raise ValidationError(f"{path}: expected uuid, got {value!r}")
-
-
-def validate_schema(value: Any, schema: dict[str, Any], path: str = "$") -> None:
-    if "allOf" in schema:
-        for option in schema["allOf"]:
-            validate_schema(value, option, path)
-
-    if "anyOf" in schema:
-        errors: list[str] = []
-        for option in schema["anyOf"]:
-            try:
-                validate_schema(value, option, path)
-                return
-            except ValidationError as exc:
-                errors.append(str(exc))
-        raise ValidationError(f"{path}: did not match anyOf: {'; '.join(errors)}")
-
-    if "oneOf" in schema:
-        matches = 0
-        errors: list[str] = []
-        for option in schema["oneOf"]:
-            try:
-                validate_schema(value, option, path)
-                matches += 1
-            except ValidationError as exc:
-                errors.append(str(exc))
-        if matches != 1:
-            raise ValidationError(f"{path}: expected exactly one oneOf match, got {matches}: {'; '.join(errors)}")
-        return
-
-    if "const" in schema and value != schema["const"]:
-        raise ValidationError(f"{path}: expected const {schema['const']!r}, got {value!r}")
-    if "enum" in schema and value not in schema["enum"]:
-        raise ValidationError(f"{path}: expected one of {schema['enum']!r}, got {value!r}")
-
-    typ = schema.get("type")
-    if isinstance(typ, list):
-        if not any(type_matches(value, t) for t in typ):
-            raise ValidationError(f"{path}: wrong type {type(value).__name__}, expected {typ!r}")
-    elif isinstance(typ, str) and not type_matches(value, typ):
-        raise ValidationError(f"{path}: wrong type {type(value).__name__}, expected {typ!r}")
-
-    if typ == "object" or "properties" in schema:
-        if not isinstance(value, dict):
-            raise ValidationError(f"{path}: expected object")
-        props = schema.get("properties", {})
-        for key in schema.get("required", []):
-            if key not in value:
-                raise ValidationError(f"{path}: missing required property {key!r}")
-        min_props = schema.get("minProperties")
-        max_props = schema.get("maxProperties")
-        if min_props is not None and len(value) < min_props:
-            raise ValidationError(f"{path}: expected at least {min_props} properties")
-        if max_props is not None and len(value) > max_props:
-            raise ValidationError(f"{path}: expected at most {max_props} properties")
-        if schema.get("additionalProperties") is False:
-            extra = sorted(set(value) - set(props))
-            if extra:
-                raise ValidationError(f"{path}: extra properties {extra!r}")
-        for key, sub in props.items():
-            if key in value:
-                validate_schema(value[key], sub, f"{path}.{key}")
-
-    if typ == "array" or "items" in schema or "prefixItems" in schema:
-        if not isinstance(value, list):
-            raise ValidationError(f"{path}: expected array")
-        min_items = schema.get("minItems")
-        max_items = schema.get("maxItems")
-        if min_items is not None and len(value) < min_items:
-            raise ValidationError(f"{path}: expected at least {min_items} items")
-        if max_items is not None and len(value) > max_items:
-            raise ValidationError(f"{path}: expected at most {max_items} items")
-        prefix_items = schema.get("prefixItems")
-        if isinstance(prefix_items, list):
-            for i, sub in enumerate(prefix_items):
-                if i < len(value):
-                    validate_schema(value[i], sub, f"{path}[{i}]")
-        items = schema.get("items")
-        if isinstance(items, dict):
-            start = len(prefix_items) if isinstance(prefix_items, list) else 0
-            for i, item in enumerate(value[start:], start):
-                validate_schema(item, items, f"{path}[{i}]")
-
-    if isinstance(value, str):
-        if "minLength" in schema and len(value) < schema["minLength"]:
-            raise ValidationError(f"{path}: string shorter than minLength {schema['minLength']}")
-        if "maxLength" in schema and len(value) > schema["maxLength"]:
-            raise ValidationError(f"{path}: string longer than maxLength {schema['maxLength']}")
-        if "pattern" in schema and re.fullmatch(schema["pattern"], value) is None:
-            raise ValidationError(f"{path}: {value!r} does not match {schema['pattern']!r}")
-        if "format" in schema:
-            _validate_format(value, schema["format"], path)
-
-    if isinstance(value, (int, float)) and not isinstance(value, bool):
-        if "minimum" in schema and value < schema["minimum"]:
-            raise ValidationError(f"{path}: {value!r} is below minimum {schema['minimum']!r}")
-        if "maximum" in schema and value > schema["maximum"]:
-            raise ValidationError(f"{path}: {value!r} is above maximum {schema['maximum']!r}")
-        if "exclusiveMinimum" in schema and value <= schema["exclusiveMinimum"]:
-            raise ValidationError(f"{path}: {value!r} is not above exclusiveMinimum {schema['exclusiveMinimum']!r}")
-        if "exclusiveMaximum" in schema and value >= schema["exclusiveMaximum"]:
-            raise ValidationError(f"{path}: {value!r} is not below exclusiveMaximum {schema['exclusiveMaximum']!r}")
-        if "multipleOf" in schema:
-            q = value / schema["multipleOf"]
-            if not math.isclose(q, round(q), rel_tol=0.0, abs_tol=1e-9):
-                raise ValidationError(f"{path}: {value!r} is not a multiple of {schema['multipleOf']!r}")
-
-
-def parse_json_strict(text: str) -> Any:
-    stripped = text.strip()
-    try:
-        return json.loads(stripped)
-    except json.JSONDecodeError as exc:
-        raise ValidationError(f"output is not JSON: {text!r}") from exc
-
-
-def json_schema_validator(schema: dict[str, Any]) -> Validator:
-    def _validate(text: str) -> str:
-        value = parse_json_strict(text)
-        validate_schema(value, schema)
-        return compact_json(value)
-
-    return _validate
-
-
-def json_object_validator(text: str) -> str:
-    value = parse_json_strict(text)
-    if not isinstance(value, dict):
-        raise ValidationError(f"output is not a JSON object: {value!r}")
-    return compact_json(value)
-
-
-def regex_validator(pattern: str) -> Validator:
-    rx = re.compile(pattern)
-
-    def _validate(text: str) -> str:
-        value = text.strip()
-        if rx.fullmatch(value) is None:
-            raise ValidationError(f"{value!r} does not match /{pattern}/")
-        return value
-
-    return _validate
-
-
-def choice_validator(choices: set[str]) -> Validator:
-    def _validate(text: str) -> str:
-        value = text.strip()
-        if value not in choices:
-            raise ValidationError(f"{value!r} is not one of {sorted(choices)!r}")
-        return value
-
-    return _validate
-
-
-def permutation_validator(chars: str) -> Validator:
-    expected = sorted(chars)
-
-    def _validate(text: str) -> str:
-        value = text.strip()
-        if sorted(value) != expected or len(value) != len(chars):
-            raise ValidationError(f"{value!r} is not a permutation of {chars!r}")
-        return value
-
-    return _validate
-
-
-def substring_chunk_validator(prefix: str, words: list[str]) -> Validator:
-    allowed = {""}
-    for i in range(len(words)):
-        for j in range(i + 1, len(words) + 1):
-            allowed.add(" ".join(words[i:j]))
-
-    def _validate(text: str) -> str:
-        value = text.strip()
-        if not value.startswith(prefix):
-            raise ValidationError(f"{value!r} does not start with {prefix!r}")
-        tail = value[len(prefix):]
-        if tail not in allowed:
-            raise ValidationError(f"{tail!r} is not an allowed contiguous word substring")
-        return value
-
-    return _validate
-
-
-def make_cases() -> list[Case]:
-    calendar_schema = {
-        "type": "object",
-        "properties": {
-            "name": {"type": "string", "minLength": 1, "maxLength": 80},
-            "date": {"type": "string", "format": "date"},
-            "participants": {
-                "type": "array",
-                "items": {"type": "string", "minLength": 1},
-                "minItems": 1,
-                "maxItems": 5,
-            },
-        },
-        "required": ["name", "date", "participants"],
-        "additionalProperties": False,
-    }
-    status_schema = {
-        "type": "object",
-        "properties": {
-            "status": {"const": "ok"},
-            "priority": {"type": "string", "enum": ["low", "medium", "high"]},
-            "retry_count": {"type": "integer", "minimum": 0, "maximum": 5},
-            "active": {"type": "boolean"},
-        },
-        "required": ["status", "priority", "retry_count", "active"],
-        "additionalProperties": False,
-    }
-    ticket_schema = {
-        "type": "object",
-        "properties": {
-            "id": {"type": "string", "pattern": "TCK-[0-9]{3}"},
-            "owner": {"anyOf": [{"type": "string", "minLength": 1}, {"type": "null"}]},
-            "priority": {"oneOf": [{"const": "low"}, {"const": "medium"}, {"const": "high"}]},
-            "contact": {"type": "string", "format": "email"},
-        },
-        "required": ["id", "owner", "priority", "contact"],
-        "additionalProperties": False,
-    }
-    reading_schema = {
-        "type": "object",
-        "properties": {
-            "reading": {
-                "type": "array",
-                "prefixItems": [
-                    {"const": "temperature_c"},
-                    {"type": "number", "minimum": -40, "exclusiveMaximum": 80, "multipleOf": 0.5},
-                ],
-                "minItems": 2,
-                "maxItems": 2,
-            },
-            "tags": {
-                "type": "array",
-                "items": {"type": "string", "pattern": "[a-z]{3,8}"},
-                "minItems": 2,
-                "maxItems": 3,
-            },
-        },
-        "required": ["reading", "tags"],
-        "additionalProperties": False,
-    }
-
-    inline_json_schema = {
-        "type": "object",
-        "properties": {
-            "kind": {"const": "metric"},
-            "value": {"type": "integer", "minimum": 1, "maximum": 9},
-        },
-        "required": ["kind", "value"],
-        "additionalProperties": False,
-    }
-    inline_json_lark = f"""%llguidance {{}}
-start: %json {compact_json(inline_json_schema)}
-"""
-    choice_lark = """%llguidance {}
-start: "red" | "green" | "blue"
-"""
-    regex_lark = """%llguidance {}
-start: /INV-[0-9]{4}/
-"""
-    regex_ext_lark = """%llguidance {}
-start: "chunk:" %regex { "substring_words": "alpha beta gamma delta" }
-"""
-    parametric_lark = """%llguidance {}
-start: perm::0x0
-perm::_: ""                   %if is_ones([0:3])
-       | "a" perm::set_bit(0) %if bit_clear(0)
-       | "b" perm::set_bit(1) %if bit_clear(1)
-       | "c" perm::set_bit(2) %if bit_clear(2)
-"""
-    guidance_lark = """%llguidance {}
-start: "YES" | "NO"
-"""
-    guidance_wire = compact_json({"grammars": [{"lark_grammar": guidance_lark}]})
-
-    return [
-        Case(
-            name="json_schema_calendar",
-            family="json_schema",
-            prompt="Return one lunch calendar event for Alice and Bob on 2026-06-01. Return only JSON.",
-            schema=calendar_schema,
-            validator=json_schema_validator(calendar_schema),
-            data=compact_json(calendar_schema),
-            oracle_sample='{"name":"Lunch","date":"2026-06-01","participants":["Alice","Bob"]}',
-        ),
-        Case(
-            name="json_schema_status",
-            family="json_schema",
-            prompt="Return a compact health-check object with status ok. Return only JSON.",
-            schema=status_schema,
-            validator=json_schema_validator(status_schema),
-            data=compact_json(status_schema),
-            oracle_sample='{"status":"ok","priority":"medium","retry_count":2,"active":true}',
-        ),
-        Case(
-            name="json_schema_anyof_oneof_format",
-            family="json_schema",
-            prompt=(
-                "Return one support ticket with an id like TCK-123, an owner or null, "
-                "one priority, and a contact email. Return only JSON."
-            ),
-            schema=ticket_schema,
-            validator=json_schema_validator(ticket_schema),
-            data=compact_json(ticket_schema),
-            oracle_sample='{"id":"TCK-123","owner":null,"priority":"high","contact":"ops@example.com"}',
-        ),
-        Case(
-            name="json_schema_tuple_numeric",
-            family="json_schema",
-            prompt="Return one sensor reading tuple and two short lowercase tags. Return only JSON.",
-            schema=reading_schema,
-            validator=json_schema_validator(reading_schema),
-            data=compact_json(reading_schema),
-            oracle_sample='{"reading":["temperature_c",21.5],"tags":["lab","green"]}',
-        ),
-        Case(
-            name="json_object_mode",
-            family="json_object",
-            prompt="Return a JSON object with a tiny task description and whether it is done.",
-            validator=json_object_validator,
-            data="",
-            oracle_sample='{"task":"check","done":false}',
-        ),
-        Case(
-            name="regex_invoice_id",
-            family="regex",
-            prompt="Return exactly one invoice id in the form INV-0427. No quotes, no prose.",
-            validator=regex_validator(r"INV-[0-9]{4}"),
-            data=r"INV-[0-9]{4}",
-            llama_grammar=regex_lark,
-            oracle_sample="INV-0427",
-            max_tokens=32,
-        ),
-        Case(
-            name="lark_choice",
-            family="lark",
-            prompt="Return exactly one lowercase color: red, green, or blue. No quotes, no prose.",
-            validator=choice_validator({"red", "green", "blue"}),
-            data=choice_lark,
-            llama_grammar=choice_lark,
-            oracle_sample="green",
-            max_tokens=16,
-        ),
-        Case(
-            name="lark_inline_json",
-            family="lark",
-            prompt='Return a compact JSON object with kind "metric" and a small integer value.',
-            validator=json_schema_validator(inline_json_schema),
-            data=inline_json_lark,
-            llama_grammar=inline_json_lark,
-            oracle_sample='{"kind":"metric","value":7}',
-            max_tokens=64,
-        ),
-        Case(
-            name="lark_regex_ext_substring",
-            family="lark",
-            prompt="Return exactly chunk:beta gamma. No quotes, no prose.",
-            validator=substring_chunk_validator("chunk:", ["alpha", "beta", "gamma", "delta"]),
-            data=regex_ext_lark,
-            llama_grammar=regex_ext_lark,
-            oracle_sample="chunk:beta gamma",
-            max_tokens=32,
-        ),
-        Case(
-            name="lark_parametric_permutation",
-            family="lark",
-            prompt="Return exactly one permutation of the letters a, b, and c. No separators, no prose.",
-            validator=permutation_validator("abc"),
-            data=parametric_lark,
-            llama_grammar=parametric_lark,
-            oracle_sample="cab",
-            max_tokens=16,
-        ),
-        Case(
-            name="llguidance_internal_wire",
-            family="llguidance",
-            prompt="Return exactly YES or NO in uppercase. No punctuation, no prose.",
-            validator=choice_validator({"YES", "NO"}),
-            data=guidance_wire,
-            llama_grammar=guidance_lark,
-            oracle_sample="YES",
-            max_tokens=16,
-        ),
-    ]
-
-
-def post_json(url: str, payload: dict[str, Any], timeout: float, api_key: str | None = None) -> dict[str, Any]:
-    data = json.dumps(payload, separators=(",", ":")).encode("utf-8")
-    headers = {"Content-Type": "application/json"}
-    if api_key:
-        headers["Authorization"] = f"Bearer {api_key}"
-    req = urllib.request.Request(url, data=data, headers=headers, method="POST")
-    try:
-        with urllib.request.urlopen(req, timeout=timeout) as resp:
-            raw = resp.read().decode("utf-8", errors="replace")
-    except urllib.error.HTTPError as exc:
-        raw = exc.read().decode("utf-8", errors="replace")
-        raise RuntimeError(f"HTTP {exc.code}: {raw[:1600]}") from exc
-    except urllib.error.URLError as exc:
-        raise RuntimeError(str(exc)) from exc
-    try:
-        body = json.loads(raw)
-    except json.JSONDecodeError as exc:
-        raise RuntimeError(f"invalid JSON response: {raw[:1600]}") from exc
-    if isinstance(body, dict) and body.get("error"):
-        raise RuntimeError(f"API error: {body['error']!r}")
-    return body
-
-
-def get_status(url: str, timeout: float) -> tuple[int | None, str]:
-    req = urllib.request.Request(url, method="GET")
-    try:
-        with urllib.request.urlopen(req, timeout=timeout) as resp:
-            return resp.status, resp.read(512).decode("utf-8", errors="replace")
-    except urllib.error.HTTPError as exc:
-        return exc.code, exc.read(512).decode("utf-8", errors="replace")
-    except urllib.error.URLError as exc:
-        return None, str(exc)
-
-
-def extract_chat_text(body: dict[str, Any]) -> str:
-    choices = body.get("choices")
-    if not isinstance(choices, list) or not choices:
-        raise RuntimeError(f"missing choices in chat response: {body!r}")
-    message = choices[0].get("message", {})
-    content = message.get("content")
-    if isinstance(content, str):
-        return content
-    if isinstance(content, list):
-        parts: list[str] = []
-        for part in content:
-            if isinstance(part, dict):
-                if isinstance(part.get("text"), str):
-                    parts.append(part["text"])
-                elif isinstance(part.get("content"), str):
-                    parts.append(part["content"])
-        if parts:
-            return "".join(parts)
-    raise RuntimeError(f"missing text content in chat response: {body!r}")
-
-
-def extract_responses_text(body: dict[str, Any]) -> str:
-    if isinstance(body.get("output_text"), str):
-        return body["output_text"]
-    parts: list[str] = []
-    for item in body.get("output", []):
-        if not isinstance(item, dict):
-            continue
-        if item.get("type") == "message":
-            for part in item.get("content", []):
-                if isinstance(part, dict) and isinstance(part.get("text"), str):
-                    parts.append(part["text"])
-    if parts:
-        return "".join(parts)
-    raise RuntimeError(f"missing output text in responses response: {body!r}")
-
-
-def response_format_for_case(case: Case, target: Target, api: str, args: argparse.Namespace) -> dict[str, Any]:
-    if case.family == "json_object":
-        fmt: dict[str, Any] = {"type": "json_object"}
-        if args.json_object_schema:
-            fmt["schema"] = {"type": "object"}
-        return fmt
-
-    if case.family == "json_schema":
-        if not case.schema:
-            raise RuntimeError(f"{case.name}: missing schema")
-        if api == "chat":
-            if target.name == "llama" and args.llama_chat_schema_style == "flat":
-                return {
-                    "type": "json_schema",
-                    "name": case.name,
-                    "strict": True,
-                    "schema": case.schema,
-                }
-            return {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": case.name,
-                    "strict": True,
-                    "schema": case.schema,
-                },
-            }
-        return {
-            "type": "json_schema",
-            "name": case.name,
-            "strict": True,
-            "schema": case.schema,
-        }
-
-    if not args.force_extensions and not target.supports_response_format_extensions:
-        raise UnsupportedCase(f"{target.name} does not expose {case.family!r} through OpenAI response_format")
-
-    if case.family == "regex":
-        return {"type": "regex", "regex": case.data}
-    if case.family == "lark":
-        return {"type": "lark", "grammar": case.data}
-    if case.family == "llguidance":
-        return {"type": "llguidance", "grammar": case.data}
-    raise UnsupportedCase(f"unknown structured-output family {case.family!r}")
-
-
-def add_llama_common_payload_fields(payload: dict[str, Any], target: Target, args: argparse.Namespace) -> None:
-    if target.name != "llama":
-        return
-    if args.llama_disable_thinking:
-        payload.setdefault("chat_template_kwargs", {})["enable_thinking"] = False
-    if args.seed is not None:
-        payload["seed"] = args.seed
-
-
-def chat_payload(target: Target, case: Case, args: argparse.Namespace) -> dict[str, Any]:
-    payload: dict[str, Any] = {
-        "model": target.model,
-        "messages": [{"role": "user", "content": case.prompt}],
-        "max_tokens": case.max_tokens,
-        "temperature": 0,
-    }
-    if (case.family in {"json_schema", "json_object"} or
-            args.force_extensions or
-            target.supports_response_format_extensions):
-        payload["response_format"] = response_format_for_case(case, target, "chat", args)
-    elif target.supports_grammar_extension and case.llama_grammar:
-        payload["grammar"] = case.llama_grammar
-    else:
-        raise UnsupportedCase(f"{target.name}/chat cannot carry {case.family!r}")
-    if args.seed is not None:
-        payload["seed"] = args.seed
-    add_llama_common_payload_fields(payload, target, args)
-    return payload
-
-
-def responses_payload(target: Target, case: Case, args: argparse.Namespace) -> dict[str, Any]:
-    payload: dict[str, Any] = {
-        "model": target.model,
-        "input": case.prompt,
-        "max_output_tokens": case.max_tokens,
-        "temperature": 0,
-    }
-    if (case.family in {"json_schema", "json_object"} or
-            args.force_extensions or
-            target.supports_response_format_extensions):
-        payload["text"] = {"format": response_format_for_case(case, target, "responses", args)}
-    elif target.supports_grammar_extension and case.llama_grammar:
-        payload["grammar"] = case.llama_grammar
-    else:
-        raise UnsupportedCase(f"{target.name}/responses cannot carry {case.family!r}")
-    if args.seed is not None:
-        payload["seed"] = args.seed
-    add_llama_common_payload_fields(payload, target, args)
-    return payload
-
-
-def check_case(target: Target, api: str, case: Case, args: argparse.Namespace) -> str:
-    if api == "chat":
-        payload = chat_payload(target, case, args)
-        body = post_json(f"{target.base_url}/chat/completions", payload, args.timeout, args.api_key)
-        text = extract_chat_text(body)
-    elif api == "responses":
-        payload = responses_payload(target, case, args)
-        body = post_json(f"{target.base_url}/responses", payload, args.timeout, args.api_key)
-        text = extract_responses_text(body)
-    else:
-        raise RuntimeError(f"unknown api {api!r}")
-    return case.validator(text)
-
-
-def split_csv(value: str) -> list[str]:
-    return [x.strip() for x in value.split(",") if x.strip()]
-
-
-def flatten_extra_args(values: list[str] | None) -> list[str]:
-    out: list[str] = []
-    for value in values or []:
-        out.extend(shlex.split(value))
-    return out
-
-
-def base_root(base_url: str) -> str:
-    if base_url.endswith("/v1"):
-        return base_url[:-3]
-    return base_url.rstrip("/")
-
-
-def port_from_url(base_url: str, default: int) -> int:
-    parsed = urllib.parse.urlparse(base_url)
-    if parsed.port:
-        return parsed.port
-    if parsed.scheme == "https":
-        return 443
-    if parsed.scheme == "http":
-        return 80
-    return default
-
-
-def target_is_ready(target: Target, timeout: float = 2.0) -> bool:
-    for url in (f"{base_root(target.base_url)}/health", f"{target.base_url}/models"):
-        status, _body = get_status(url, timeout)
-        if status == 200:
-            return True
-    return False
-
-
-def wait_ready(target: Target, startup_timeout: float) -> None:
-    deadline = time.time() + startup_timeout
-    last = ""
-    while time.time() < deadline:
-        if target.process and target.process.poll() is not None:
-            log_hint = f" log={target.log_path}" if target.log_path else ""
-            raise RuntimeError(f"{target.name} server exited with code {target.process.returncode}.{log_hint}")
-        for url in (f"{base_root(target.base_url)}/health", f"{target.base_url}/models"):
-            status, body = get_status(url, 2.0)
-            last = f"{url}: {status} {body[:200]}"
-            if status == 200:
-                return
-        time.sleep(1.0)
-    log_hint = f" log={target.log_path}" if target.log_path else ""
-    raise RuntimeError(f"{target.name} did not become ready within {startup_timeout:.0f}s ({last}).{log_hint}")
-
-
-def start_target_if_needed(target: Target, args: argparse.Namespace) -> None:
-    already_ready = target_is_ready(target)
-    if args.start == "never":
-        if not already_ready:
-            raise RuntimeError(f"{target.name} is not reachable at {target.base_url} and --start=never was set")
-        return
-    if already_ready and args.start != "always":
-        return
-    if not target.command:
-        raise RuntimeError(f"{target.name} has no launch command")
-
-    log_file = tempfile.NamedTemporaryFile(
-        prefix=f"stress-{target.name}-",
-        suffix=".log",
-        mode="w",
-        encoding="utf-8",
-        delete=False,
-    )
-    target.log_path = Path(log_file.name)
-    target.process = subprocess.Popen(
-        target.command,
-        cwd=str(target.cwd),
-        stdout=log_file,
-        stderr=subprocess.STDOUT,
-        text=True,
-    )
-    target.started_by_us = True
-    wait_ready(target, args.startup_timeout)
-
-
-def stop_target(target: Target) -> None:
-    if not target.process or target.process.poll() is not None:
-        return
-    target.process.terminate()
-    try:
-        target.process.wait(timeout=15)
-    except subprocess.TimeoutExpired:
-        target.process.kill()
-        target.process.wait(timeout=15)
-
-
-def build_ds4_command(args: argparse.Namespace) -> list[str]:
-    if args.ds4_cmd:
-        return shlex.split(args.ds4_cmd.format(port=port_from_url(args.ds4_base_url, 8000)))
-    return [
-        args.ds4_binary,
-        "--model",
-        args.ds4_model_path,
-        "--ctx",
-        str(args.server_ctx),
-        "--tokens",
-        str(args.server_default_tokens),
-        "--port",
-        str(port_from_url(args.ds4_base_url, 8000)),
-        *flatten_extra_args(args.ds4_extra_arg),
-    ]
-
-
-def build_llama_command(args: argparse.Namespace) -> list[str]:
-    if args.llama_cmd:
-        return shlex.split(args.llama_cmd.format(port=port_from_url(args.llama_base_url, 8080)))
-    cmd = [
-        args.llama_binary,
-        "-hf",
-        args.llama_hf_model,
-        "-c",
-        str(args.server_ctx),
-        "--port",
-        str(port_from_url(args.llama_base_url, 8080)),
-        "--jinja",
-    ]
-    if args.llama_ngl is not None:
-        cmd.extend(["-ngl", str(args.llama_ngl)])
-    cmd.extend(flatten_extra_args(args.llama_extra_arg))
-    return cmd
-
-
-def selected_cases(args: argparse.Namespace) -> list[Case]:
-    cases = make_cases()
-    families = set(split_csv(args.families)) if args.families != "all" else set()
-    names = set(args.case or [])
-    out = [
-        c for c in cases
-        if (not families or c.family in families) and (not names or c.name in names)
-    ]
-    missing = names - {c.name for c in cases}
-    if missing:
-        raise SystemExit(f"unknown case(s): {', '.join(sorted(missing))}")
-    known_families = {c.family for c in cases}
-    unknown_families = families - known_families
-    if unknown_families:
-        raise SystemExit(f"unknown family/families: {', '.join(sorted(unknown_families))}")
-    return out
-
-
-def run_llguidance_oracle(cases: list[Case], args: argparse.Namespace) -> Counts:
-    counts = Counts()
-    if args.oracle == "never":
-        return counts
-    try:
-        import llguidance  # type: ignore
-    except ModuleNotFoundError:
-        msg = "SKIP oracle/llguidance: python package is not importable"
-        if args.oracle == "require":
-            print(msg, file=sys.stderr)
-            counts.failed += 1
-        elif args.verbose:
-            print(msg)
-            counts.skipped += 1
-        return counts
-
-    try:
-        tok = llguidance.LLTokenizer("byte")
-    except Exception as exc:
-        msg = f"SKIP oracle/llguidance: failed to create byte tokenizer: {exc}"
-        if args.oracle == "require":
-            print(msg, file=sys.stderr)
-            counts.failed += 1
-        elif args.verbose:
-            print(msg)
-            counts.skipped += 1
-        return counts
-
-    for case in cases:
-        if case.family == "json_schema":
-            grammar = llguidance.LLMatcher.grammar_from_json_schema(case.schema)
-        elif case.family == "json_object":
-            grammar = llguidance.LLMatcher.grammar_from_json_schema({"type": "object"})
-        elif case.family == "regex":
-            grammar = llguidance.LLMatcher.grammar_from_regex(case.data)
-        elif case.family == "lark":
-            grammar = llguidance.LLMatcher.grammar_from_lark(case.data)
-        elif case.family == "llguidance":
-            grammar = llguidance.grammar_from("llguidance", case.data)
-        else:
-            counts.skipped += 1
-            continue
-
-        label = f"oracle/{case.family}/{case.name}"
-        try:
-            err = llguidance.LLMatcher.validate_grammar(grammar, tok)
-            if err:
-                raise RuntimeError(err)
-            if case.oracle_sample is not None:
-                matcher = llguidance.LLMatcher(tok, grammar)
-                for token in tok.tokenize_str(case.oracle_sample):
-                    bias = matcher.compute_logit_bias()
-                    if token >= len(bias) or bias[token] != 200:
-                        raise RuntimeError(f"sample token {token} is not allowed")
-                    if not matcher.consume_token(token):
-                        raise RuntimeError(f"sample token {token} was rejected")
-                if not matcher.is_accepting():
-                    raise RuntimeError("sample did not leave matcher in accepting state")
-            print(f"PASS {label}")
-            counts.passed += 1
-        except Exception as exc:
-            print(f"FAIL {label}: {exc}", file=sys.stderr)
-            counts.failed += 1
-            if args.fail_fast:
-                raise
-    return counts
-
-
-def run_target(target: Target, cases: list[Case], args: argparse.Namespace) -> Counts:
-    counts = Counts()
-    start_target_if_needed(target, args)
-    apis = split_csv(args.apis)
-    for repeat in range(args.repeat):
-        for api in apis:
-            for case in cases:
-                label = f"{target.name}/{api}/{case.family}/{case.name}"
-                if args.repeat > 1:
-                    label = f"{label}#{repeat + 1}"
-                t0 = time.time()
-                try:
-                    value = check_case(target, api, case, args)
-                    elapsed = time.time() - t0
-                    if args.verbose:
-                        print(f"PASS {label} {elapsed:.2f}s {value}")
-                    else:
-                        print(f"PASS {label} {elapsed:.2f}s")
-                    counts.passed += 1
-                except UnsupportedCase as exc:
-                    elapsed = time.time() - t0
-                    msg = f"SKIP {label} {elapsed:.2f}s: {exc}"
-                    if args.strict_skips:
-                        print(msg, file=sys.stderr)
-                        counts.failed += 1
-                        if args.fail_fast:
-                            raise
-                    else:
-                        print(msg)
-                        counts.skipped += 1
-                except Exception as exc:
-                    counts.failed += 1
-                    print(f"FAIL {label}: {exc}", file=sys.stderr)
-                    if args.fail_fast:
-                        raise
-    return counts
-
-
-def parse_args() -> argparse.Namespace:
-    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
-    p.add_argument("--targets", default="ds4,llama", help="Comma-separated targets: ds4,llama")
-    p.add_argument("--apis", default="chat,responses", help="Comma-separated APIs: chat,responses")
-    p.add_argument("--families", default="all", help="Comma-separated families or all")
-    p.add_argument("--case", action="append", help="Run only this case name; may repeat")
-    p.add_argument("--list-cases", action="store_true", help="Print selected cases and exit")
-    p.add_argument("--repeat", type=int, default=1)
-    p.add_argument("--timeout", type=float, default=180.0)
-    p.add_argument("--startup-timeout", type=float, default=900.0)
-    p.add_argument("--start", choices=["missing", "never", "always"], default="missing")
-    p.add_argument("--stop-started", action=argparse.BooleanOptionalAction, default=True)
-    p.add_argument("--strict-skips", action="store_true", help="Treat unsupported matrix entries as failures")
-    p.add_argument("--force-extensions", action="store_true", help="Send regex/lark/llguidance as experimental response_format types")
-    p.add_argument("--fail-fast", action="store_true")
-    p.add_argument("--verbose", action="store_true")
-    p.add_argument("--api-key", help="Optional bearer token for non-local OpenAI-compatible servers")
-    p.add_argument("--seed", type=int, default=1)
-    p.add_argument("--json-object-schema", action="store_true", help="Attach {'type':'object'} to json_object mode")
-    p.add_argument("--oracle", choices=["auto", "never", "require"], default="auto", help="Local Python llguidance grammar validation")
-
-    p.add_argument("--server-ctx", type=int, default=8192)
-    p.add_argument("--server-default-tokens", type=int, default=384)
-
-    p.add_argument("--ds4-base-url", default=DEFAULT_DS4_BASE_URL)
-    p.add_argument("--ds4-model", default="ds4")
-    p.add_argument("--ds4-binary", default="./ds4-server")
-    p.add_argument("--ds4-model-path", default="ds4flash.gguf")
-    p.add_argument("--ds4-cmd", help="Override ds4 launch command; {port} is expanded")
-    p.add_argument("--ds4-extra-arg", action="append", help="Extra ds4-server args; may repeat")
-
-    p.add_argument("--llama-base-url", default=DEFAULT_LLAMA_BASE_URL)
-    p.add_argument("--llama-model", default=DEFAULT_LLAMA_HF_MODEL)
-    p.add_argument("--llama-binary", default="llama-server")
-    p.add_argument("--llama-hf-model", default=DEFAULT_LLAMA_HF_MODEL)
-    p.add_argument("--llama-cmd", help="Override llama launch command; {port} is expanded")
-    p.add_argument("--llama-extra-arg", action="append", help="Extra llama-server args; may repeat")
-    p.add_argument("--llama-ngl", type=int, default=999, help="llama.cpp GPU layers; set -1 to omit")
-    p.add_argument(
-        "--llama-chat-schema-style",
-        choices=["openai", "flat"],
-        default="flat",
-        help="json_schema shape for llama.cpp chat response_format",
-    )
-    p.add_argument("--llama-disable-thinking", action=argparse.BooleanOptionalAction, default=True)
-    return p.parse_args()
-
-
-def main() -> int:
-    args = parse_args()
-    if args.llama_ngl is not None and args.llama_ngl < 0:
-        args.llama_ngl = None
-
-    repo = Path(__file__).resolve().parent
-    targets_requested = split_csv(args.targets)
-    unknown_targets = set(targets_requested) - {"ds4", "llama"}
-    if unknown_targets:
-        raise SystemExit(f"unknown target(s): {', '.join(sorted(unknown_targets))}")
-
-    cases = selected_cases(args)
-    if args.list_cases:
-        for case in cases:
-            print(f"{case.family}\t{case.name}")
-        return 0
-
-    total = Counts()
-
-    oracle_counts = run_llguidance_oracle(cases, args)
-    total.passed += oracle_counts.passed
-    total.failed += oracle_counts.failed
-    total.skipped += oracle_counts.skipped
-
-    target_map: dict[str, Target] = {
-        "ds4": Target(
-            name="ds4",
-            base_url=args.ds4_base_url.rstrip("/"),
-            model=args.ds4_model,
-            command=build_ds4_command(args),
-            cwd=repo,
-            supports_response_format_extensions=True,
-            supports_grammar_extension=False,
-        ),
-        "llama": Target(
-            name="llama",
-            base_url=args.llama_base_url.rstrip("/"),
-            model=args.llama_model,
-            command=build_llama_command(args),
-            cwd=repo,
-            supports_response_format_extensions=False,
-            supports_grammar_extension=True,
-        ),
-    }
-
-    try:
-        for name in targets_requested:
-            target = target_map[name]
-            counts = run_target(target, cases, args)
-            total.passed += counts.passed
-            total.failed += counts.failed
-            total.skipped += counts.skipped
-            if args.stop_started and target.started_by_us:
-                stop_target(target)
-    finally:
-        for target in target_map.values():
-            if args.stop_started and target.started_by_us:
-                stop_target(target)
-
-    print(f"SUMMARY pass={total.passed} fail={total.failed} skip={total.skipped}")
-    return 1 if total.failed else 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())

From 234fbf771f0d8252b0d216c573fc0fa1bd776b3c Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Sat, 30 May 2026 15:49:25 +0100
Subject: [PATCH 7/9] Remove structured output stress test

---
 tests/structured_outputs_stress.py | 424 -----------------------------
 1 file changed, 424 deletions(-)
 delete mode 100755 tests/structured_outputs_stress.py

diff --git a/tests/structured_outputs_stress.py b/tests/structured_outputs_stress.py
deleted file mode 100755
index 9bc7610fc..000000000
--- a/tests/structured_outputs_stress.py
+++ /dev/null
@@ -1,424 +0,0 @@
-#!/usr/bin/env python3
-"""Stress JSON structured outputs on OpenAI-compatible chat/responses APIs.
-
-Examples:
-  python3 tests/structured_outputs_stress.py \
-      --base-url http://127.0.0.1:8000/v1 --model ds4 --apis chat,responses
-
-  python3 tests/structured_outputs_stress.py \
-      --base-url http://127.0.0.1:8080/v1 --model qwen --apis chat
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import re
-import sys
-import time
-import urllib.error
-import urllib.request
-from dataclasses import dataclass
-from typing import Any
-
-
-@dataclass(frozen=True)
-class Case:
-    name: str
-    prompt: str
-    schema: dict[str, Any] | None
-    json_object: bool = False
-
-
-CASES: list[Case] = [
-    Case(
-        name="calendar_event",
-        prompt=(
-            "Create one calendar event for Alice and Bob having lunch on "
-            "2026-06-01 at noon. Return only the requested JSON object."
-        ),
-        schema={
-            "type": "object",
-            "properties": {
-                "name": {"type": "string"},
-                "date": {"type": "string"},
-                "participants": {
-                    "type": "array",
-                    "items": {"type": "string"},
-                    "minItems": 1,
-                    "maxItems": 5,
-                },
-            },
-            "required": ["name", "date", "participants"],
-            "additionalProperties": False,
-        },
-    ),
-    Case(
-        name="enum_const_integer_boolean",
-        prompt=(
-            "Return a compact health-check result. Use status ok, one priority, "
-            "a retry count, and whether the system is active."
-        ),
-        schema={
-            "type": "object",
-            "properties": {
-                "status": {"const": "ok"},
-                "priority": {"type": "string", "enum": ["low", "medium", "high"]},
-                "retry_count": {"type": "integer", "minimum": 0, "maximum": 5},
-                "active": {"type": "boolean"},
-            },
-            "required": ["status", "priority", "retry_count", "active"],
-            "additionalProperties": False,
-        },
-    ),
-    Case(
-        name="nested_arrays",
-        prompt=(
-            "Return a 2 by 2 integer matrix and two short labels. Keep values "
-            "small and return only JSON."
-        ),
-        schema={
-            "type": "object",
-            "properties": {
-                "matrix": {
-                    "type": "array",
-                    "minItems": 2,
-                    "maxItems": 2,
-                    "items": {
-                        "type": "array",
-                        "minItems": 2,
-                        "maxItems": 2,
-                        "items": {"type": "integer", "minimum": -9, "maximum": 9},
-                    },
-                },
-                "labels": {
-                    "type": "array",
-                    "minItems": 2,
-                    "maxItems": 2,
-                    "items": {"type": "string"},
-                },
-            },
-            "required": ["matrix", "labels"],
-            "additionalProperties": False,
-        },
-    ),
-    Case(
-        name="nullable_anyof_number_bounds",
-        prompt=(
-            "Return a score between zero and one, and use either an owner name "
-            "or null if unknown."
-        ),
-        schema={
-            "type": "object",
-            "properties": {
-                "owner": {"anyOf": [{"type": "string"}, {"type": "null"}]},
-                "score": {"type": "number", "minimum": 0, "maximum": 1},
-            },
-            "required": ["owner", "score"],
-            "additionalProperties": False,
-        },
-    ),
-    Case(
-        name="pattern_string",
-        prompt="Return an inventory code in the form two uppercase letters, dash, three digits.",
-        schema={
-            "type": "object",
-            "properties": {
-                "code": {"type": "string", "pattern": "^[A-Z]{2}-[0-9]{3}$"}
-            },
-            "required": ["code"],
-            "additionalProperties": False,
-        },
-    ),
-    Case(
-        name="json_object_mode",
-        prompt="Return a JSON object with two fields describing a tiny task list.",
-        schema=None,
-        json_object=True,
-    ),
-]
-
-
-class ValidationError(Exception):
-    pass
-
-
-def type_matches(value: Any, typ: str) -> bool:
-    if typ == "object":
-        return isinstance(value, dict)
-    if typ == "array":
-        return isinstance(value, list)
-    if typ == "string":
-        return isinstance(value, str)
-    if typ == "integer":
-        return isinstance(value, int) and not isinstance(value, bool)
-    if typ == "number":
-        return (isinstance(value, int) or isinstance(value, float)) and not isinstance(value, bool)
-    if typ == "boolean":
-        return isinstance(value, bool)
-    if typ == "null":
-        return value is None
-    return True
-
-
-def validate_schema(value: Any, schema: dict[str, Any], path: str = "$") -> None:
-    if "anyOf" in schema:
-        errors: list[str] = []
-        for option in schema["anyOf"]:
-            try:
-                validate_schema(value, option, path)
-                return
-            except ValidationError as exc:
-                errors.append(str(exc))
-        raise ValidationError(f"{path}: did not match anyOf: {'; '.join(errors)}")
-
-    if "const" in schema and value != schema["const"]:
-        raise ValidationError(f"{path}: expected const {schema['const']!r}, got {value!r}")
-    if "enum" in schema and value not in schema["enum"]:
-        raise ValidationError(f"{path}: expected one of {schema['enum']!r}, got {value!r}")
-
-    typ = schema.get("type")
-    if isinstance(typ, list):
-        if not any(type_matches(value, t) for t in typ):
-            raise ValidationError(f"{path}: wrong type {type(value).__name__}, expected {typ}")
-    elif isinstance(typ, str) and not type_matches(value, typ):
-        raise ValidationError(f"{path}: wrong type {type(value).__name__}, expected {typ}")
-
-    if typ == "object" or "properties" in schema:
-        if not isinstance(value, dict):
-            raise ValidationError(f"{path}: expected object")
-        props = schema.get("properties", {})
-        for key in schema.get("required", []):
-            if key not in value:
-                raise ValidationError(f"{path}: missing required property {key!r}")
-        if schema.get("additionalProperties") is False:
-            extra = sorted(set(value) - set(props))
-            if extra:
-                raise ValidationError(f"{path}: extra properties {extra!r}")
-        for key, sub in props.items():
-            if key in value:
-                validate_schema(value[key], sub, f"{path}.{key}")
-
-    if typ == "array" or "items" in schema:
-        if not isinstance(value, list):
-            raise ValidationError(f"{path}: expected array")
-        min_items = schema.get("minItems")
-        max_items = schema.get("maxItems")
-        if min_items is not None and len(value) < min_items:
-            raise ValidationError(f"{path}: expected at least {min_items} items")
-        if max_items is not None and len(value) > max_items:
-            raise ValidationError(f"{path}: expected at most {max_items} items")
-        items = schema.get("items")
-        if isinstance(items, dict):
-            for i, item in enumerate(value):
-                validate_schema(item, items, f"{path}[{i}]")
-
-    if isinstance(value, str) and "pattern" in schema:
-        if re.fullmatch(schema["pattern"], value) is None:
-            raise ValidationError(f"{path}: {value!r} does not match {schema['pattern']!r}")
-
-    if isinstance(value, (int, float)) and not isinstance(value, bool):
-        if "minimum" in schema and value < schema["minimum"]:
-            raise ValidationError(f"{path}: {value!r} is below minimum {schema['minimum']!r}")
-        if "maximum" in schema and value > schema["maximum"]:
-            raise ValidationError(f"{path}: {value!r} is above maximum {schema['maximum']!r}")
-
-
-def post_json(url: str, payload: dict[str, Any], timeout: float) -> dict[str, Any]:
-    data = json.dumps(payload, separators=(",", ":")).encode("utf-8")
-    req = urllib.request.Request(
-        url,
-        data=data,
-        headers={"Content-Type": "application/json"},
-        method="POST",
-    )
-    try:
-        with urllib.request.urlopen(req, timeout=timeout) as resp:
-            raw = resp.read().decode("utf-8", errors="replace")
-    except urllib.error.HTTPError as exc:
-        raw = exc.read().decode("utf-8", errors="replace")
-        raise RuntimeError(f"HTTP {exc.code}: {raw[:1000]}") from exc
-    except urllib.error.URLError as exc:
-        raise RuntimeError(str(exc)) from exc
-    try:
-        body = json.loads(raw)
-    except json.JSONDecodeError as exc:
-        raise RuntimeError(f"invalid JSON response: {raw[:1000]}") from exc
-    if isinstance(body, dict) and body.get("error"):
-        raise RuntimeError(f"API error: {body['error']!r}")
-    return body
-
-
-def chat_payload(model: str, case: Case, json_object_schema: bool) -> dict[str, Any]:
-    response_format: dict[str, Any]
-    if case.json_object:
-        response_format = {"type": "json_object"}
-        if json_object_schema:
-            response_format["schema"] = {"type": "object"}
-    else:
-        response_format = {
-            "type": "json_schema",
-            "json_schema": {
-                "name": case.name,
-                "strict": True,
-                "schema": case.schema,
-            },
-        }
-    return {
-        "model": model,
-        "messages": [{"role": "user", "content": case.prompt}],
-        "max_tokens": 256,
-        "temperature": 0,
-        "response_format": response_format,
-    }
-
-
-def responses_payload(model: str, case: Case, json_object_schema: bool) -> dict[str, Any]:
-    fmt: dict[str, Any]
-    if case.json_object:
-        fmt = {"type": "json_object"}
-        if json_object_schema:
-            fmt["schema"] = {"type": "object"}
-    else:
-        fmt = {
-            "type": "json_schema",
-            "name": case.name,
-            "strict": True,
-            "schema": case.schema,
-        }
-    return {
-        "model": model,
-        "input": case.prompt,
-        "max_output_tokens": 256,
-        "temperature": 0,
-        "text": {"format": fmt},
-    }
-
-
-def extract_chat_text(body: dict[str, Any]) -> str:
-    choices = body.get("choices")
-    if not isinstance(choices, list) or not choices:
-        raise RuntimeError(f"missing choices in chat response: {body!r}")
-    message = choices[0].get("message", {})
-    content = message.get("content")
-    if isinstance(content, str):
-        return content
-    if isinstance(content, list):
-        out: list[str] = []
-        for part in content:
-            if isinstance(part, dict) and isinstance(part.get("text"), str):
-                out.append(part["text"])
-        return "".join(out)
-    raise RuntimeError(f"missing text content in chat response: {body!r}")
-
-
-def extract_responses_text(body: dict[str, Any]) -> str:
-    if isinstance(body.get("output_text"), str):
-        return body["output_text"]
-    out: list[str] = []
-    for item in body.get("output", []):
-        if not isinstance(item, dict):
-            continue
-        if item.get("type") == "message":
-            for part in item.get("content", []):
-                if isinstance(part, dict) and isinstance(part.get("text"), str):
-                    out.append(part["text"])
-    if out:
-        return "".join(out)
-    raise RuntimeError(f"missing output text in responses response: {body!r}")
-
-
-def check_case(
-    api: str,
-    base_url: str,
-    model: str,
-    case: Case,
-    timeout: float,
-    json_object_schema: bool,
-) -> str:
-    if api == "chat":
-        body = post_json(
-            f"{base_url}/chat/completions",
-            chat_payload(model, case, json_object_schema),
-            timeout,
-        )
-        text = extract_chat_text(body)
-    elif api == "responses":
-        body = post_json(
-            f"{base_url}/responses",
-            responses_payload(model, case, json_object_schema),
-            timeout,
-        )
-        text = extract_responses_text(body)
-    else:
-        raise RuntimeError(f"unknown api {api!r}")
-
-    try:
-        value = json.loads(text.strip())
-    except json.JSONDecodeError as exc:
-        raise RuntimeError(f"{api}/{case.name}: output is not JSON: {text!r}") from exc
-    if not isinstance(value, dict):
-        raise RuntimeError(f"{api}/{case.name}: output is not a JSON object: {value!r}")
-    if case.schema is not None:
-        validate_schema(value, case.schema)
-    return json.dumps(value, ensure_ascii=False, sort_keys=True)
-
-
-def parse_args() -> argparse.Namespace:
-    p = argparse.ArgumentParser()
-    p.add_argument("--base-url", required=True, help="Base URL, usually http://host:port/v1")
-    p.add_argument("--model", required=True)
-    p.add_argument("--apis", default="chat,responses", help="Comma-separated: chat,responses")
-    p.add_argument("--case", action="append", help="Run only this case name; may repeat")
-    p.add_argument("--repeat", type=int, default=1)
-    p.add_argument("--timeout", type=float, default=120.0)
-    p.add_argument(
-        "--json-object-schema",
-        action="store_true",
-        help="Send {'type':'object'} with json_object mode for servers that require a concrete schema.",
-    )
-    p.add_argument("--verbose", action="store_true")
-    return p.parse_args()
-
-
-def main() -> int:
-    args = parse_args()
-    base_url = args.base_url.rstrip("/")
-    apis = [x.strip() for x in args.apis.split(",") if x.strip()]
-    selected = set(args.case or [])
-    cases = [c for c in CASES if not selected or c.name in selected]
-    missing = selected - {c.name for c in CASES}
-    if missing:
-        print(f"unknown case(s): {', '.join(sorted(missing))}", file=sys.stderr)
-        return 2
-
-    failures = 0
-    for repeat in range(args.repeat):
-        for api in apis:
-            for case in cases:
-                label = f"{api}/{case.name}"
-                if args.repeat > 1:
-                    label = f"{label}#{repeat + 1}"
-                t0 = time.time()
-                try:
-                    value = check_case(
-                        api,
-                        base_url,
-                        args.model,
-                        case,
-                        args.timeout,
-                        args.json_object_schema,
-                    )
-                    elapsed = time.time() - t0
-                    if args.verbose:
-                        print(f"PASS {label} {elapsed:.2f}s {value}")
-                    else:
-                        print(f"PASS {label} {elapsed:.2f}s")
-                except Exception as exc:
-                    failures += 1
-                    print(f"FAIL {label}: {exc}", file=sys.stderr)
-    return 1 if failures else 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(main())

From 9d7b92b06cb218550da9074a7a53fa6432441691 Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Sat, 30 May 2026 15:54:41 +0100
Subject: [PATCH 8/9] Remove stale structured output stress test reference

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 6d55447ce..900fbbe2c 100644
--- a/README.md
+++ b/README.md
@@ -1150,7 +1150,6 @@ extractor self-test run first:
 make test                  # ./ds4-eval --self-test-extractors && ./ds4_test --all
 ./ds4_test --logprob-vectors
 ./ds4_test --server
-python3 tests/structured_outputs_stress.py --base-url http://127.0.0.1:8000/v1 --model ds4 --apis chat,responses
 ```
 
 ## Debugging Notes

From 084d7c3db5f1e37f9f5b09a1028ac948b2a84335 Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Sat, 30 May 2026 16:37:57 +0100
Subject: [PATCH 9/9] Allow thinking with structured outputs

---
 README.md    |  5 +++--
 ds4_server.c | 28 ++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 900fbbe2c..a478ef6a4 100644
--- a/README.md
+++ b/README.md
@@ -653,8 +653,9 @@ static library there. To use an existing checkout instead, pass
 With that build, `/v1/chat/completions` supports
 `response_format.type=json_schema`, `json_object`, `regex`, `lark`, and
 `llguidance`; `/v1/responses` supports the same modes through `text.format`.
-Structured outputs use constrained decoding, disable thinking for that turn,
-and currently cannot be combined with tools.
+Structured outputs use constrained decoding. If thinking is enabled, the
+constraint applies after `</think>` so the final assistant content is structured.
+Structured outputs currently cannot be combined with tools.
 
 `/v1/messages` is the Anthropic-compatible endpoint used by Claude Code style
 clients. It accepts `system`, `messages`, `tools`, `tool_choice`, `max_tokens`,
diff --git a/ds4_server.c b/ds4_server.c
index cce33485c..df750a9c8 100644
--- a/ds4_server.c
+++ b/ds4_server.c
@@ -3274,8 +3274,6 @@ static bool parse_chat_request(ds4_engine *e, server *s, const char *body, int d
             request_free(r);
             return false;
         }
-        thinking_enabled = false;
-        got_thinking = true;
     }
     if (!got_thinking && model_alias_disables_thinking(r->model)) thinking_enabled = false;
     if (!got_thinking && model_alias_enables_thinking(r->model)) thinking_enabled = true;
@@ -4448,9 +4446,6 @@ static bool parse_responses_request(ds4_engine *e, server *s, const char *body,
             request_free(r);
             return false;
         }
-        thinking_enabled = false;
-        got_thinking = true;
-        r->reasoning_summary_emit = false;
     }
     if (!got_thinking && model_alias_disables_thinking(r->model)) thinking_enabled = false;
     if (!got_thinking && model_alias_enables_thinking(r->model)) thinking_enabled = true;
@@ -10873,6 +10868,11 @@ static void generate_job(server *s, job *j) {
     double last_decode_log_t = decode_t0;
     int last_decode_log_completion = 0;
     thinking_state thinking = thinking_state_from_prompt(&j->req);
+    bool structured_waiting_for_think_close = structured && thinking.inside;
+    if (structured_waiting_for_think_close) {
+        trace_event(s, trace_id,
+                    "structured output constraint delayed until </think>");
+    }
     const bool thinking_gates_tool_markers = ds4_think_mode_enabled(j->req.think_mode);
     bool tool_scan_waiting_for_think_close =
         thinking_gates_tool_markers && thinking.inside;
@@ -10900,7 +10900,8 @@ static void generate_job(server *s, job *j) {
         if (in_tool_call && !dsml_decode_state_uses_payload_sampling(dsml_state)) {
             temperature = 0.0f;
         }
-        int token = structured ?
+        bool structured_active = structured && !structured_waiting_for_think_close;
+        int token = structured_active ?
             ds4_llguidance_sample(structured, s->session,
                                   temperature, top_k, top_p, min_p,
                                   &rng, err, sizeof(err)) :
@@ -10916,7 +10917,8 @@ static void generate_job(server *s, job *j) {
 
         int toks[17];
         int ntok = 0;
-        if (!structured &&
+        if (!structured_active &&
+            !structured_waiting_for_think_close &&
             temperature <= 0.0f &&
             ds4_engine_mtp_draft_tokens(s->engine) > 1 &&
             getenv("DS4_MTP_SPEC_DISABLE") == NULL)
@@ -10950,7 +10952,8 @@ static void generate_job(server *s, job *j) {
                 stop_decode = true;
                 break;
             }
-            if (structured &&
+            structured_active = structured && !structured_waiting_for_think_close;
+            if (structured_active &&
                 !ds4_llguidance_accept(structured, s->engine, token,
                                        err, sizeof(err))) {
                 finish = "error";
@@ -10964,7 +10967,16 @@ static void generate_job(server *s, job *j) {
 
             trace_piece(s, trace_id, piece, piece_len);
             buf_append(&text, piece, piece_len);
+            bool was_thinking_inside = thinking.inside;
             thinking_state_feed(&thinking, piece, piece_len);
+            if (structured_waiting_for_think_close &&
+                was_thinking_inside &&
+                !thinking.inside)
+            {
+                structured_waiting_for_think_close = false;
+                trace_event(s, trace_id,
+                            "structured output constraint activated after </think>");
+            }
             if (j->req.kind == REQ_CHAT && j->req.has_tools) {
                 dsml_decode_tracker_update(&dsml_tracker, text.ptr, text.len);
             }