diff --git a/tests/test_bench_serve_workload_hardening.py b/tests/test_bench_serve_workload_hardening.py
new file mode 100644
index 00000000..27715d4e
--- /dev/null
+++ b/tests/test_bench_serve_workload_hardening.py
@@ -0,0 +1,581 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Targeted regression tests for the bench-serve workload runner.
+
+The existing ``test_bench_serve.py`` already covers happy paths for workload
+loading, sweep expansion, formatters, runner end-to-end, and basic quality
+checks. Issue #499 highlighted that the load/validate/run/report code paths
+in ``vllm_mlx/bench_serve.py`` are dense and worth pinning further. This
+module fills the remaining corners around the areas the issue calls out:
+
+- ``load_workload`` validation errors. Every guard clause that rejects a
+  malformed workload JSON, plus tag-string normalisation.
+- ``load_workload`` default merging. ``max_tokens``, ``enable_thinking``,
+  and the workload-name fallback to filename stem.
+- Streaming tool-call accumulation. ``accumulate_tool_calls`` and
+  ``finalize_tool_calls`` chunk-boundary behaviour and ordering.
+- ``validate_quality_checks`` diagnostics. Tool-call argument validation
+  edge cases (invalid JSON, non-object, missing keys, count mismatch) and
+  the combination of ``no_tool_calls`` with content-length checks.
+- Artifact schema compatibility. ``summarize_workload_results``,
+  ``format_workload_json``, ``format_workload_csv``, and
+  ``format_workload_table`` are pinned to a stable core set of keys and
+  columns so an accidental rename or deletion during refactor is caught.
+
+Brittleness note: assertions on ``match=`` strings and diagnostic
+substrings deliberately use short, stable anchors instead of full
+sentences. The contract is the diagnostic *intent* (which subject the
+error names, which check failed), not the exact wording.
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+
+import pytest
+
+from vllm_mlx.bench_serve import (
+    accumulate_tool_calls,
+    finalize_tool_calls,
+    format_workload_csv,
+    format_workload_json,
+    format_workload_table,
+    load_workload,
+    summarize_workload_results,
+    validate_quality_checks,
+)
+
+# ---------------------------------------------------------------------------
+# load_workload — validation guard clauses
+# ---------------------------------------------------------------------------
+
+
+class TestLoadWorkloadValidation:
+    def _write(self, tmp_path: Path, payload) -> Path:
+        f = tmp_path / "workload.json"
+        f.write_text(json.dumps(payload))
+        return f
+
+    def test_root_must_be_object(self, tmp_path: Path):
+        f = tmp_path / "workload.json"
+        f.write_text(json.dumps(["not", "an", "object"]))
+        with pytest.raises(ValueError, match=r"root.*JSON|JSON.*root"):
+            load_workload(f)
+
+    def test_empty_cases_list_rejected(self, tmp_path: Path):
+        f = self._write(tmp_path, {"cases": []})
+        with pytest.raises(ValueError, match="cases"):
+            load_workload(f)
+
+    def test_missing_cases_key_rejected(self, tmp_path: Path):
+        f = self._write(tmp_path, {"defaults": {}})
+        with pytest.raises(ValueError, match="cases"):
+            load_workload(f)
+
+    def test_defaults_must_be_object(self, tmp_path: Path):
+        f = self._write(
+            tmp_path,
+            {
+                "defaults": "not-an-object",
+                "cases": [{"id": "a", "messages": [{"role": "user", "content": "hi"}]}],
+            },
+        )
+        with pytest.raises(ValueError, match="defaults"):
+            load_workload(f)
+
+    def test_case_must_be_object(self, tmp_path: Path):
+        f = self._write(tmp_path, {"cases": ["not-an-object"]})
+        # ``case`` alone would also match the list-level "non-empty cases"
+        # error; pin to the per-item form.
+        with pytest.raises(ValueError, match=r"case.*must be"):
+            load_workload(f)
+
+    def test_extra_body_invalid_type_rejected(self, tmp_path: Path):
+        f = self._write(
+            tmp_path,
+            {
+                "cases": [
+                    {
+                        "id": "a",
+                        "messages": [{"role": "user", "content": "hi"}],
+                        "extra_body": "not-an-object",
+                    }
+                ]
+            },
+        )
+        # ``dict.update`` on a string raises ValueError with a "sequence"
+        # message before reaching the explicit ``extra_body must be an
+        # object`` check. Anchor on either path so the test still catches
+        # a regression that takes the explicit branch.
+        with pytest.raises((ValueError, TypeError), match=r"extra_body|sequence|dict"):
+            load_workload(f)
+
+    def test_tags_string_is_normalised_to_list(self, tmp_path: Path):
+        f = self._write(
+            tmp_path,
+            {
+                "cases": [
+                    {
+                        "id": "a",
+                        "messages": [{"role": "user", "content": "hi"}],
+                        "tags": "single-tag",
+                    }
+                ]
+            },
+        )
+        workload = load_workload(f)
+        assert workload.cases[0].tags == ("single-tag",)
+
+    def test_tags_invalid_type_rejected(self, tmp_path: Path):
+        f = self._write(
+            tmp_path,
+            {
+                "cases": [
+                    {
+                        "id": "a",
+                        "messages": [{"role": "user", "content": "hi"}],
+                        "tags": 42,
+                    }
+                ]
+            },
+        )
+        with pytest.raises(ValueError, match="tags"):
+            load_workload(f)
+
+    def test_checks_non_dict_truthy_rejected_as_value_error(self, tmp_path: Path):
+        # Before this PR, a non-dict truthy ``checks`` value (string or
+        # list) crashed with a cryptic ``AttributeError`` from
+        # ``checks.items()`` deep inside the loader. Now ``_merge_case_checks``
+        # validates the type and raises a case-scoped ``ValueError``.
+        f = self._write(
+            tmp_path,
+            {
+                "cases": [
+                    {
+                        "id": "a",
+                        "messages": [{"role": "user", "content": "hi"}],
+                        "checks": "not-an-object",
+                    }
+                ]
+            },
+        )
+        with pytest.raises(ValueError, match="checks"):
+            load_workload(f)
+
+
+# ---------------------------------------------------------------------------
+# load_workload — default merging
+# ---------------------------------------------------------------------------
+
+
+class TestLoadWorkloadDefaultMerging:
+    """Pin the default-merging behaviours not already covered by
+    ``tests/test_bench_serve.py``. The existing suite covers basic
+    ``max_tokens`` / ``enable_thinking`` / ``policy_timeout_ms`` propagation
+    from defaults (see ``test_load_workload_with_defaults``) and
+    ``checks`` list-merging. The corners left open are: (a) case-level
+    values winning over defaults, and (b) workload name falling back to
+    the filename stem when no ``name`` field is set.
+    """
+
+    def _write(self, tmp_path: Path, payload, *, name: str = "workload.json") -> Path:
+        f = tmp_path / name
+        f.write_text(json.dumps(payload))
+        return f
+
+    def test_case_value_wins_over_default_max_tokens(self, tmp_path: Path):
+        f = self._write(
+            tmp_path,
+            {
+                "defaults": {"max_tokens": 64},
+                "cases": [
+                    {
+                        "id": "a",
+                        "messages": [{"role": "user", "content": "hi"}],
+                        "max_tokens": 200,
+                    }
+                ],
+            },
+        )
+        workload = load_workload(f)
+        assert workload.cases[0].max_tokens == 200
+
+    def test_workload_name_defaults_to_filename_stem(self, tmp_path: Path):
+        f = self._write(
+            tmp_path,
+            {"cases": [{"id": "a", "messages": [{"role": "user", "content": "hi"}]}]},
+            name="my-suite.json",
+        )
+        workload = load_workload(f)
+        assert workload.name == "my-suite"
+
+
+# ---------------------------------------------------------------------------
+# Streaming tool-call accumulation
+# ---------------------------------------------------------------------------
+
+
+class TestStreamingToolCallAccumulation:
+    def test_concatenates_name_and_arguments_across_deltas(self):
+        acc: dict[int, dict] = {}
+        # First delta: id + name fragment.
+        accumulate_tool_calls(
+            acc,
+            [
+                {
+                    "index": 0,
+                    "id": "call_1",
+                    "type": "function",
+                    "function": {"name": "get_", "arguments": '{"city":'},
+                }
+            ],
+        )
+        # Second delta: rest of name + rest of arguments.
+        accumulate_tool_calls(
+            acc,
+            [
+                {
+                    "index": 0,
+                    "function": {"name": "weather", "arguments": '"Tokyo"}'},
+                }
+            ],
+        )
+        finalised = finalize_tool_calls(acc)
+        assert len(finalised) == 1
+        tc = finalised[0]
+        assert tc["id"] == "call_1"
+        assert tc["function"]["name"] == "get_weather"
+        assert json.loads(tc["function"]["arguments"]) == {"city": "Tokyo"}
+
+    def test_finalize_returns_index_sorted_even_when_inserted_out_of_order(self):
+        acc: dict[int, dict] = {}
+        # Indices arrive out of order: 2, 0, 1.
+        accumulate_tool_calls(
+            acc,
+            [{"index": 2, "id": "c", "function": {"name": "third"}}],
+        )
+        accumulate_tool_calls(
+            acc,
+            [{"index": 0, "id": "a", "function": {"name": "first"}}],
+        )
+        accumulate_tool_calls(
+            acc,
+            [{"index": 1, "id": "b", "function": {"name": "second"}}],
+        )
+        finalised = finalize_tool_calls(acc)
+        assert [tc["function"]["name"] for tc in finalised] == [
+            "first",
+            "second",
+            "third",
+        ]
+
+    def test_id_set_on_first_delta_is_preserved_when_later_delta_omits_id(self):
+        acc: dict[int, dict] = {}
+        accumulate_tool_calls(
+            acc,
+            [{"index": 0, "id": "call_X", "function": {"name": "f"}}],
+        )
+        # Later delta omits the id (only sends argument fragment).
+        accumulate_tool_calls(
+            acc,
+            [{"index": 0, "function": {"arguments": "{}"}}],
+        )
+        assert acc[0]["id"] == "call_X"
+
+    def test_default_index_is_zero_when_omitted(self):
+        """OpenAI's spec says ``index`` is required, but mlx-lm style streams
+        sometimes omit it on the first delta. Accumulator must default to 0
+        rather than raise."""
+        acc: dict[int, dict] = {}
+        accumulate_tool_calls(
+            acc,
+            [{"id": "call_1", "function": {"name": "f", "arguments": "{}"}}],
+        )
+        assert 0 in acc
+        assert acc[0]["function"]["name"] == "f"
+
+
+# ---------------------------------------------------------------------------
+# validate_quality_checks — diagnostics for tool-call argument checks
+# ---------------------------------------------------------------------------
+
+
+class TestQualityCheckDiagnostics:
+    def test_tool_call_count_mismatch_reports_actual_count(self):
+        ok, issues = validate_quality_checks(
+            finish_reason="stop",
+            content="ignored",
+            checks={"tool_call_count": 2},
+            tool_calls=[{"function": {"name": "f", "arguments": "{}"}}],
+        )
+        assert not ok
+        # Anchor on the check name and the two relevant counts. We don't
+        # pin the exact wording around "expected"/"got" so a future
+        # cleanup of the diagnostic phrasing doesn't break the test.
+        assert any(
+            "tool_call_count" in issue and "1" in issue and "2" in issue
+            for issue in issues
+        )
+
+    def test_tool_call_args_invalid_json_reports_issue(self):
+        ok, issues = validate_quality_checks(
+            finish_reason="stop",
+            content="ignored",
+            checks={"tool_call_args_required_keys": {"f": ["x"]}},
+            tool_calls=[{"function": {"name": "f", "arguments": "{not-json"}}],
+        )
+        assert not ok
+        assert any("invalid JSON" in issue for issue in issues)
+
+    def test_tool_call_args_non_object_reports_issue(self):
+        ok, issues = validate_quality_checks(
+            finish_reason="stop",
+            content="ignored",
+            checks={"tool_call_args_required_keys": {"f": ["x"]}},
+            tool_calls=[{"function": {"name": "f", "arguments": "[1, 2, 3]"}}],
+        )
+        assert not ok
+        assert any("not an object" in issue for issue in issues)
+
+    def test_tool_call_args_missing_keys_lists_what_is_missing(self):
+        ok, issues = validate_quality_checks(
+            finish_reason="stop",
+            content="ignored",
+            checks={"tool_call_args_required_keys": {"f": ["a", "b", "c"]}},
+            tool_calls=[{"function": {"name": "f", "arguments": '{"a": 1, "x": 2}'}}],
+        )
+        assert not ok
+        # The diagnostic must name the missing keys so an operator can fix
+        # the prompt or the check, not just say "something is missing".
+        # Find the issue that talks about missing keys, then assert b/c are
+        # listed and the present key 'a' is not. Quoting matters: the keys
+        # are emitted as Python repr so we anchor on "'a'" to avoid
+        # matching the letter 'a' inside other words.
+        missing_issue = next((i for i in issues if "missing" in i), None)
+        assert missing_issue is not None, f"no missing-keys diagnostic in {issues}"
+        assert "'b'" in missing_issue and "'c'" in missing_issue
+        assert "'a'" not in missing_issue
+
+    def test_no_tool_calls_combines_with_other_checks(self):
+        ok, issues = validate_quality_checks(
+            finish_reason="stop",
+            content="hi",
+            checks={"no_tool_calls": True, "min_chars": 100},
+            tool_calls=[{"function": {"name": "f", "arguments": "{}"}}],
+        )
+        assert not ok
+        # Both checks fail; both must surface so the operator sees the full
+        # picture rather than chasing them one at a time.
+        joined = " ".join(issues)
+        assert "no_tool_calls" in joined
+        assert "min_chars" in joined
+
+    def test_finish_reason_list_accepts_any_member(self):
+        # finish_reason="length" is treated as truncation by the basic check
+        # in validate_response, so we exercise a non-truncation alternative
+        # (tool_calls) that the test should accept when present in the list.
+        ok, issues = validate_quality_checks(
+            finish_reason="tool_calls",
+            content="",
+            checks={"finish_reason": ["stop", "tool_calls"]},
+            tool_calls=[{"function": {"name": "f", "arguments": "{}"}}],
+        )
+        assert ok, issues
+
+    def test_finish_reason_string_form_rejects_others(self):
+        # Single-string form: only "stop" is allowed; finish_reason="tool_calls"
+        # must surface as an explicit issue, not a generic basic-check failure.
+        ok, issues = validate_quality_checks(
+            finish_reason="tool_calls",
+            content="",
+            checks={"finish_reason": "stop"},
+            tool_calls=[{"function": {"name": "f", "arguments": "{}"}}],
+        )
+        assert not ok
+        # Anchor on the check name plus the rejected value. Wording around
+        # how the rejection is phrased ("not in", "not allowed", etc.) is
+        # not part of the contract.
+        assert any(
+            "finish_reason" in issue and "tool_calls" in issue for issue in issues
+        )
+
+
+# ---------------------------------------------------------------------------
+# Artifact schema compatibility
+# ---------------------------------------------------------------------------
+
+
+def _make_record(*, case_id: str = "a", quality_ok: bool = True) -> dict:
+    """Minimal but realistic workload record for schema/format tests.
+
+    Mirrors the shape ``run_workload_case`` produces (see
+    ``vllm_mlx.bench_serve.run_workload_case`` for the source-of-truth
+    record builder). Kept local to this module so changes to the runner
+    don't accidentally hide schema drift in the formatters.
+    """
+    return {
+        "run_id": "run-x",
+        "timestamp": "2026-01-01T00:00:00Z",
+        "started_at": "2026-01-01T00:00:00Z",
+        "workload": "demo",
+        "case_id": case_id,
+        "repetition": 0,
+        "tags": [],
+        "model_id": "test/model",
+        "runtime": {
+            "engine_type": "test",
+            "model_type": "tiny",
+            "mtp_enabled": False,
+            "specprefill": False,
+            "kv_quant": "",
+            "cache_type": "paged",
+        },
+        "hardware": {
+            "chip": "M0",
+            "memory_gb": 1.0,
+            "os_version": "darwin-test",
+        },
+        "request": {
+            "max_tokens": 128,
+            "request_path": None,
+            "enable_thinking": False,
+            "extra_body": {},
+            "message_count": 1,
+        },
+        "policy": {"timeout_ms": None, "within_timeout": None},
+        "cache_reset": {"attempted": False},
+        "metrics": {
+            "ttft_ms": 1.0,
+            "tpot_ms": 1.0,
+            "e2e_latency_ms": 10.0,
+            "gen_tps": 10.0,
+            "prompt_tps": 100.0,
+            "prompt_tokens": 10,
+            "completion_tokens": 10,
+            "cache_hits": 0,
+            "cache_misses": 0,
+            "tokens_saved": 0,
+            "metal": {
+                "metal_active_gb": 0.0,
+                "metal_peak_gb": 0.0,
+                "metal_cache_gb": 0.0,
+            },
+        },
+        "quality": {
+            "ok": quality_ok,
+            "issues": [],
+            "finish_reason": "stop",
+            "content_chars": 10,
+            "content_preview": "hi",
+        },
+        "tool_calls": None,
+        "ok": quality_ok,
+    }
+
+
+class TestArtifactSchemaCompatibility:
+    """Pin a stable core set of keys/columns in the workload artifacts so a
+    refactor cannot silently drop a field that downstream operators or
+    release qualification dashboards depend on. The contract is
+    *core-keys-must-exist*, not *exact match*: adding new fields is fine,
+    deleting or renaming a core field breaks the suite.
+    """
+
+    def test_summarize_results_has_core_top_level_keys(self):
+        summary = summarize_workload_results([_make_record()])
+        core = {
+            "case_count",
+            "unique_case_count",
+            "repetition_count",
+            "passed",
+            "failure_count",
+            "failure_rate",
+            "quality_passed",
+            "quality_failure_count",
+            "policy_timeout_passed",
+            "policy_timeout_failure_count",
+            "latency_ms",
+            "ttft_ms",
+            "gen_tps",
+            "case_summaries",
+        }
+        missing = core - summary.keys()
+        assert not missing, f"summary lost core keys: {sorted(missing)}"
+
+    def test_summarize_results_per_case_has_core_keys(self):
+        summary = summarize_workload_results([_make_record(case_id="resume")])
+        case = summary["case_summaries"]["resume"]
+        core = {
+            "sample_count",
+            "repetitions",
+            "passed",
+            "failure_count",
+            "failure_rate",
+            "policy_timeout_passed",
+            "policy_timeout_failure_count",
+            "latency_ms",
+            "ttft_ms",
+            "gen_tps",
+            "content_chars",
+        }
+        missing = core - case.keys()
+        assert not missing, f"per-case summary lost core keys: {sorted(missing)}"
+
+    def test_workload_table_renders_core_columns(self):
+        payload = {"results": [_make_record(case_id="resume")]}
+        rendered = format_workload_table(payload)
+        # Anchor on the column headers that downstream qualification
+        # dashboards rely on. New columns are fine; deleting any of these
+        # is a contract break.
+        for column in (
+            "case_id",
+            "repetition",
+            "ttft_ms",
+            "gen_tps",
+            "e2e_latency_ms",
+            "quality_ok",
+        ):
+            assert column in rendered, f"table missing column: {column}"
+        assert "resume" in rendered
+
+    def test_workload_csv_rows_have_core_columns(self):
+        payload = {"results": [_make_record(case_id="resume")]}
+        output = format_workload_csv(payload)
+        rows = list(csv.DictReader(output.splitlines()))
+        assert len(rows) == 1
+        for column in (
+            "run_id",
+            "timestamp",
+            "workload",
+            "case_id",
+            "repetition",
+            "model_id",
+            "ttft_ms",
+            "gen_tps",
+            "e2e_latency_ms",
+            "quality_ok",
+            "finish_reason",
+        ):
+            assert column in rows[0], f"CSV missing column: {column}"
+        assert rows[0]["case_id"] == "resume"
+
+    def test_workload_json_passes_through_top_level_keys(self):
+        # ``format_workload_json`` is a thin dump, so the contract is that
+        # any top-level keys callers attach (workload metadata, summary,
+        # results) round-trip unchanged. Pin that explicitly so a future
+        # change that rewraps the payload doesn't go unnoticed.
+        payload = {
+            "workload": "demo",
+            "summary": {"passed": True},
+            "results": [_make_record()],
+        }
+        parsed = json.loads(format_workload_json(payload))
+        assert parsed.keys() >= {"workload", "summary", "results"}
+        assert parsed["results"][0]["case_id"] == "a"
+        assert parsed["summary"]["passed"] is True
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/vllm_mlx/bench_serve.py b/vllm_mlx/bench_serve.py
index 3e3a639a..6dddea90 100644
--- a/vllm_mlx/bench_serve.py
+++ b/vllm_mlx/bench_serve.py
@@ -178,6 +178,128 @@ def _first_not_none(*values: Any) -> Any:
     return None
 
 
+def _normalize_tags(tags: Any, *, case_id: str) -> tuple[str, ...]:
+    """Coerce a workload case's ``tags`` field to a tuple of strings.
+
+    Accepts either a single string (treated as a one-element list) or a
+    list. Any other type is rejected with a case-scoped ``ValueError``.
+    """
+    if isinstance(tags, str):
+        tags = [tags]
+    if not isinstance(tags, list):
+        raise ValueError(f"{case_id}: tags must be a list or string")
+    return tuple(str(tag) for tag in tags)
+
+
+def _merge_case_checks(
+    default_checks: Any,
+    case_checks: Any,
+    *,
+    case_id: str,
+) -> Optional[dict]:
+    """Merge a case's ``checks`` over the workload defaults.
+
+    Most keys are overridden by the case-level value. The two regex list
+    keys (``required_regex``, ``forbidden_regex``) are list-concatenated
+    with default patterns first and case patterns appended, so
+    case-level patterns extend defaults rather than replace them.
+    Returns ``None`` when neither source contributes any checks.
+
+    Rejects a non-dict ``case_checks`` with a ``ValueError`` named after
+    the case so the operator gets a clear message instead of the
+    ``AttributeError`` that the previous inline code raised on
+    ``case_checks.items()``.
+    """
+    merged: dict = dict(default_checks or {})
+    if not case_checks:
+        return merged or None
+    if not isinstance(case_checks, dict):
+        raise ValueError(f"{case_id}: checks must be an object")
+    for key, value in case_checks.items():
+        if (
+            key in ("required_regex", "forbidden_regex")
+            and isinstance(value, list)
+            and isinstance(merged.get(key), list)
+        ):
+            merged[key] = merged[key] + value
+        else:
+            merged[key] = value
+    return merged or None
+
+
+def _build_workload_case(
+    item: Any,
+    idx: int,
+    *,
+    defaults: dict,
+    workload_path: Path,
+) -> WorkloadCase:
+    """Construct one ``WorkloadCase`` from a raw workload entry.
+
+    Validates the entry shape, loads request defaults from a sibling JSON
+    file when ``request_path`` is provided, merges ``extra_body`` and
+    ``checks`` against the workload defaults, and resolves scalar fields
+    (``max_tokens``, ``enable_thinking``, ``policy_timeout_ms``) via
+    ``_first_not_none`` priority: case-level beats request_path defaults
+    beats workload defaults.
+
+    ``extra_body`` follows a different merge: it composes the
+    ``request_path`` extras (base) with either the case-level
+    ``extra_body`` if present, otherwise the workload-default
+    ``extra_body``. The case-vs-default fallback is a get-with-default,
+    not a three-way merge.
+    """
+    if not isinstance(item, dict):
+        raise ValueError(f"case {idx}: case must be an object")
+    case_id = str(item.get("id") or f"case_{idx + 1}")
+
+    request_path = item.get("request_path")
+    request_defaults: dict = {}
+    if request_path is not None:
+        request_defaults = _load_case_request(
+            str(request_path), workload_path=workload_path, case_id=case_id
+        )
+
+    messages = _require_message_list(
+        item.get("messages", request_defaults.get("messages")),
+        label=case_id,
+    )
+
+    extra_body = item.get("extra_body", defaults.get("extra_body"))
+    request_extra = _request_extra_body(request_defaults)
+    if extra_body:
+        request_extra.update(extra_body)
+    extra_body = request_extra or None
+
+    checks = _merge_case_checks(
+        defaults.get("checks"), item.get("checks"), case_id=case_id
+    )
+
+    return WorkloadCase(
+        case_id=case_id,
+        messages=messages,
+        request_path=str(request_path) if request_path is not None else None,
+        max_tokens=_first_not_none(
+            item.get("max_tokens"),
+            request_defaults.get("max_tokens"),
+            defaults.get("max_tokens"),
+        ),
+        enable_thinking=_first_not_none(
+            item.get("enable_thinking"),
+            request_defaults.get("enable_thinking"),
+            defaults.get("enable_thinking"),
+        ),
+        extra_body=extra_body,
+        policy_timeout_ms=_first_not_none(
+            item.get("policy_timeout_ms"),
+            request_defaults.get("policy_timeout_ms"),
+            defaults.get("policy_timeout_ms"),
+        ),
+        checks=checks,
+        tags=_normalize_tags(item.get("tags", []), case_id=case_id),
+    )
+
+
 def load_workload(path: str | Path) -> Workload:
     """Load a declarative serving benchmark workload.
 
@@ -200,73 +322,10 @@ def load_workload(path: str | Path) -> Workload:
     if not isinstance(defaults, dict):
         raise ValueError("workload defaults must be an object")
 
-    cases: list[WorkloadCase] = []
-    for idx, item in enumerate(raw_cases):
-        if not isinstance(item, dict):
-            raise ValueError(f"case {idx}: case must be an object")
-        case_id = str(item.get("id") or f"case_{idx + 1}")
-        request_path = item.get("request_path")
-        request_defaults: dict = {}
-        if request_path is not None:
-            request_defaults = _load_case_request(
-                str(request_path), workload_path=workload_path, case_id=case_id
-            )
-        messages = _require_message_list(
-            item.get("messages", request_defaults.get("messages")),
-            label=case_id,
-        )
-        extra_body = item.get("extra_body", defaults.get("extra_body"))
-        request_extra = _request_extra_body(request_defaults)
-        if extra_body:
-            request_extra.update(extra_body)
-        extra_body = request_extra or None
-        if extra_body is not None and not isinstance(extra_body, dict):
-            raise ValueError(f"{case_id}: extra_body must be an object")
-        merged_checks = dict(defaults.get("checks") or {})
-        if item.get("checks"):
-            for key, value in item["checks"].items():
-                if (
-                    key in ("required_regex", "forbidden_regex")
-                    and isinstance(value, list)
-                    and isinstance(merged_checks.get(key), list)
-                ):
-                    merged_checks[key] = merged_checks[key] + value
-                else:
-                    merged_checks[key] = value
-        checks = merged_checks or None
-        if checks is not None and not isinstance(checks, dict):
-            raise ValueError(f"{case_id}: checks must be an object")
-        tags = item.get("tags", [])
-        if isinstance(tags, str):
-            tags = [tags]
-        if not isinstance(tags, list):
-            raise ValueError(f"{case_id}: tags must be a list or string")
-
-        cases.append(
-            WorkloadCase(
-                case_id=case_id,
-                messages=messages,
-                request_path=str(request_path) if request_path is not None else None,
-                max_tokens=_first_not_none(
-                    item.get("max_tokens"),
-                    request_defaults.get("max_tokens"),
-                    defaults.get("max_tokens"),
-                ),
-                enable_thinking=_first_not_none(
-                    item.get("enable_thinking"),
-                    request_defaults.get("enable_thinking"),
-                    defaults.get("enable_thinking"),
-                ),
-                extra_body=extra_body,
-                policy_timeout_ms=_first_not_none(
-                    item.get("policy_timeout_ms"),
-                    request_defaults.get("policy_timeout_ms"),
-                    defaults.get("policy_timeout_ms"),
-                ),
-                checks=checks,
-                tags=tuple(str(tag) for tag in tags),
-            )
-        )
+    cases = [
+        _build_workload_case(item, idx, defaults=defaults, workload_path=workload_path)
+        for idx, item in enumerate(raw_cases)
+    ]
 
     return Workload(
         name=str(raw.get("name") or workload_path.stem),