diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index 4307ae44..ee29d1e4 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -39,23 +39,27 @@ jobs:
     steps:
       - name: Get PR Reference
         id: get-ref
-        run: |
-          set -euo pipefail
-          if [ "${{ github.event_name }}" == "issue_comment" ]; then
-            PR_URL="${{ github.event.issue.pull_request.url }}"
-            SHA=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$PR_URL" | jq -r .head.sha)
-            if [ -z "$SHA" ] || [ "$SHA" = "null" ]; then
-              echo "::error::Failed to resolve PR head SHA"
-              exit 1
-            fi
-            echo "sha=$SHA" >> "$GITHUB_OUTPUT"
-            echo "issue_number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT"
-            echo "comment_user=${{ github.event.comment.user.login }}" >> "$GITHUB_OUTPUT"
-          else
-            echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
-            echo "issue_number=" >> "$GITHUB_OUTPUT"
-            echo "comment_user=" >> "$GITHUB_OUTPUT"
-          fi
+        uses: actions/github-script@v9
+        with:
+          script: |
+            // Resolve the commit to test: the PR head for comment triggers, the event SHA otherwise.
+            if (context.eventName === 'issue_comment') {
+              const { owner, repo } = context.repo;
+              const prNumber = context.issue.number;
+              const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: prNumber });
+              const sha = pr.head?.sha;
+              if (!sha) {
+                core.setFailed('Failed to resolve PR head SHA');
+                return;
+              }
+              core.setOutput('sha', sha);
+              core.setOutput('issue_number', String(prNumber));
+              core.setOutput('comment_user', context.payload.comment.user.login);
+            } else {
+              core.setOutput('sha', context.sha); // runtime value; nothing is templated into the script source
+              core.setOutput('issue_number', '');
+              core.setOutput('comment_user', '');
+            }
 
       - name: Reply with Action Link
         if: github.event_name == 'issue_comment'
@@ -176,15 +180,20 @@ jobs:
         if: always() && needs.check-permission.outputs.trigger_issue_number != ''
         continue-on-error: true
         uses: actions/github-script@v9
+        env:
+          ISSUE_NUMBER: ${{ needs.check-permission.outputs.trigger_issue_number }}
+          COMMENT_USER: ${{ needs.check-permission.outputs.trigger_comment_user }}
+          TEST_OUTCOME: ${{ steps.run-tests.outcome }}
+          SUMMARY_JSON: ${{ toJson(steps.summary.outputs.summary) }}
         with:
           github-token: ${{ secrets.COMMENT_TOKEN }}
           script: |
-            const issueNumber = Number('${{ needs.check-permission.outputs.trigger_issue_number }}');
-            const commentUser = '${{ needs.check-permission.outputs.trigger_comment_user }}';
-            const success = '${{ steps.run-tests.outcome }}' === 'success';
+            const issueNumber = Number(process.env.ISSUE_NUMBER);
+            const commentUser = process.env.COMMENT_USER;
+            const success = process.env.TEST_OUTCOME === 'success';
+            const summary = JSON.parse(process.env.SUMMARY_JSON || '""');
             const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
 
-            const summary = ${{ toJson(steps.summary.outputs.summary) }};
             const icon = success ? '✅' : '❌';
             const status = success ? 'PASSED' : 'FAILED';
             const body = [
diff --git a/.github/workflows/llm-integration.yml b/.github/workflows/llm-benchmark.yml
similarity index 56%
rename from .github/workflows/llm-integration.yml
rename to .github/workflows/llm-benchmark.yml
index dbc07453..89deab51 100644
--- a/.github/workflows/llm-integration.yml
+++ b/.github/workflows/llm-benchmark.yml
@@ -1,4 +1,4 @@
-name: LLM Integration Tests
+name: LLM Benchmark
 
 on:
   workflow_dispatch:
@@ -7,24 +7,12 @@ on:
         description: "LLM model name (leave empty for default)"
         required: false
         default: ""
-      filter:
-        description: "Test category filter (routing|extraction|pipeline|clarification)"
+      scenario:
+        description: "Run a single scenario by ID (leave empty for all)"
         required: false
         default: ""
   issue_comment:
     types: [created]
-  push:
-    branches:
-      - master
-    paths:
-      - 'backend/**'
-      - 'scripts/**'
-      - 'tests/**'
-      - 'sclaw'
-      - 'sclaw_cn'
-      - 'package.json'
-      - '.github/workflows/llm-integration.yml'
-
 permissions:
   contents: read
 
@@ -36,41 +24,44 @@ jobs:
       issues: write
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'push' ||
       (github.event.issue.pull_request &&
-       contains(github.event.comment.body, '/test-llm') &&
+       contains(github.event.comment.body, '/test-llm-benchmark') &&
        (github.event.comment.user.login == 'guyi2000' || github.event.comment.user.login == 'qinsz01'))
     runs-on: ubuntu-latest
     outputs:
       pr_sha: ${{ steps.get-ref.outputs.sha }}
-      filter_arg: ${{ steps.get-ref.outputs.filter_arg }}
+      scenario_arg: ${{ steps.get-ref.outputs.scenario_arg }}
       trigger_issue_number: ${{ steps.get-ref.outputs.issue_number }}
       trigger_comment_user: ${{ steps.get-ref.outputs.comment_user }}
     steps:
       - name: Get PR Reference
         id: get-ref
-        env:
-          COMMENT_BODY: ${{ github.event.comment.body }}
-        run: |
-          set -euo pipefail
-          if [ "${{ github.event_name }}" == "issue_comment" ]; then
-            PR_URL="${{ github.event.issue.pull_request.url }}"
-            SHA=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$PR_URL" | jq -r .head.sha)
-            if [ -z "$SHA" ] || [ "$SHA" = "null" ]; then
-              echo "::error::Failed to resolve PR head SHA"
-              exit 1
-            fi
-            echo "sha=$SHA" >> "$GITHUB_OUTPUT"
-            FILTER=$(echo "$COMMENT_BODY" | sed -n 's|/test-llm[[:space:]]*\(.*\)|\1|p' | xargs)
-            echo "filter_arg=$FILTER" >> "$GITHUB_OUTPUT"
-            echo "issue_number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT"
-            echo "comment_user=${{ github.event.comment.user.login }}" >> "$GITHUB_OUTPUT"
-          else
-            echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
-            echo "filter_arg=${{ github.event.inputs.filter }}" >> "$GITHUB_OUTPUT"
-            echo "issue_number=" >> "$GITHUB_OUTPUT"
-            echo "comment_user=" >> "$GITHUB_OUTPUT"
-          fi
+        uses: actions/github-script@v9
+        with:
+          script: |
+            // Outputs: sha, scenario_arg, issue_number, comment_user (the last two are empty unless comment-triggered).
+            if (context.eventName === 'issue_comment') {
+              const { owner, repo } = context.repo;
+              const prNumber = context.issue.number;
+              const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: prNumber });
+              const sha = pr.head?.sha;
+              if (!sha) {
+                core.setFailed('Failed to resolve PR head SHA');
+                return;
+              }
+              const body = context.payload.comment.body || '';
+              // The optional scenario ID must follow the command on the same line; later lines are ignored.
+              const match = body.match(/\/test-llm-benchmark(?:[ \t]+([a-zA-Z0-9_-]+))?/);
+              core.setOutput('sha', sha);
+              core.setOutput('scenario_arg', match?.[1] ?? '');
+              core.setOutput('issue_number', String(prNumber));
+              core.setOutput('comment_user', context.payload.comment.user.login);
+            } else {
+              core.setOutput('sha', context.sha);
+              core.setOutput('scenario_arg', context.payload.inputs?.scenario ?? ''); // read at runtime, never spliced into JS source
+              core.setOutput('issue_number', '');
+              core.setOutput('comment_user', '');
+            }
 
       - name: Reply with Action Link
         if: github.event_name == 'issue_comment'
@@ -81,9 +72,9 @@ jobs:
           script: |
             const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
             const body = [
-              `✅ **LLM Integration Tests Triggered!**`,
+              `✅ **LLM Benchmark Triggered!**`,
               ``,
-              `@${context.payload.comment.user.login}, I've started the workflow for you.`,
+              `@${context.payload.comment.user.login}, I've started the benchmark run.`,
               `Please click the link below to monitor progress and **approve the deployment** to the test environment:`,
               ``,
               `🚀 **[View Action Run Details](${runUrl})**`,
@@ -96,7 +87,7 @@ jobs:
               body: body
             })
 
-  llm-integration:
+  llm-benchmark:
     needs: check-permission
     permissions:
       contents: read
@@ -104,9 +95,9 @@ jobs:
     environment: test
     runs-on: ubuntu-latest
     concurrency:
-      group: llm-integration-${{ github.event.issue.number || github.sha }}
+      group: llm-benchmark-${{ github.event.issue.number || github.sha }}
       cancel-in-progress: true
-    timeout-minutes: 60
+    timeout-minutes: 90
     env:
       NEXT_TELEMETRY_DISABLED: 1
     steps:
@@ -133,33 +124,44 @@ jobs:
       - name: Build via sclaw
         run: node ./sclaw build
 
-      - name: Run LLM integration tests
-        id: run-tests
+      - name: Run LLM benchmark
+        id: run-benchmark
         env:
           LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
           LLM_MODEL: ${{ github.event.inputs.llm_model || vars.LLM_MODEL }}
           LLM_BASE_URL: ${{ vars.LLM_BASE_URL }}
+          LLM_JUDGE_API_KEY: ${{ secrets.LLM_JUDGE_API_KEY || secrets.LLM_API_KEY }}
+          LLM_JUDGE_MODEL: ${{ vars.LLM_JUDGE_MODEL }}
           LLM_LOG_ENABLED: "true"
           LLM_LOG_DIR: ${{ github.workspace }}/.structureclaw/logs
-          DATABASE_URL: "file:.structureclaw/data/structureclaw-llm-ci.db"
+          DATABASE_URL: "file:.structureclaw/data/structureclaw-benchmark-ci.db"
+          SCENARIO_ARG: ${{ needs.check-permission.outputs.scenario_arg }}  # passed via env so the value is never expanded into the shell script text
         run: |
           set -o pipefail
-          FILTER_ARG="${{ needs.check-permission.outputs.filter_arg }}"
-          if [ -n "$FILTER_ARG" ]; then
-            node tests/runner.mjs llm-integration --filter "$FILTER_ARG" 2>&1 | tee test-output.txt
+          if [ -n "${SCENARIO_ARG:-}" ]; then
+            node tests/runner.mjs llm-benchmark --scenario "$SCENARIO_ARG" --output benchmark-results.json 2>&1 | tee test-output.txt
           else
-            node tests/runner.mjs llm-integration 2>&1 | tee test-output.txt
+            node tests/runner.mjs llm-benchmark --output benchmark-results.json 2>&1 | tee test-output.txt
           fi
 
+      - name: Upload benchmark results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: benchmark-results
+          path: |
+            benchmark-results.json
+            .structureclaw/logs/
+
       - name: Extract test summary
         if: always()
         id: summary
         run: |
           if [ ! -f test-output.txt ]; then
-            echo "summary=Tests did not produce output." >> "$GITHUB_OUTPUT"
+            echo "summary=Benchmark did not produce output." >> "$GITHUB_OUTPUT"
             exit 0
           fi
-          SUMMARY=$(grep -B 1 -A 3 '^Results:' test-output.txt || echo "No summary found")
+          SUMMARY=$(grep -B 1 -A 5 '^Benchmark Results:' test-output.txt || echo "No summary found")
           echo "summary<<EOF" >> "$GITHUB_OUTPUT"
           echo "$SUMMARY" >> "$GITHUB_OUTPUT"
           echo "EOF" >> "$GITHUB_OUTPUT"
@@ -168,27 +170,32 @@ jobs:
         if: always() && needs.check-permission.outputs.trigger_issue_number != ''
         continue-on-error: true
         uses: actions/github-script@v9
+        env:
+          ISSUE_NUMBER: ${{ needs.check-permission.outputs.trigger_issue_number }}
+          COMMENT_USER: ${{ needs.check-permission.outputs.trigger_comment_user }}
+          BENCHMARK_OUTCOME: ${{ steps.run-benchmark.outcome }}
+          SUMMARY_JSON: ${{ toJson(steps.summary.outputs.summary) }}
         with:
           github-token: ${{ secrets.COMMENT_TOKEN }}
           script: |
-            const issueNumber = Number('${{ needs.check-permission.outputs.trigger_issue_number }}');
-            const commentUser = '${{ needs.check-permission.outputs.trigger_comment_user }}';
-            const success = '${{ steps.run-tests.outcome }}' === 'success';
-            const summary = ${{ toJson(steps.summary.outputs.summary) }};
+            const issueNumber = Number(process.env.ISSUE_NUMBER);
+            const commentUser = process.env.COMMENT_USER;
+            const success = process.env.BENCHMARK_OUTCOME === 'success';
+            const summary = JSON.parse(process.env.SUMMARY_JSON || '""');
             const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
 
             const icon = success ? '✅' : '❌';
             const status = success ? 'PASSED' : 'FAILED';
             const body = [
-              `${icon} **LLM Integration Tests ${status}**`,
+              `${icon} **LLM Benchmark ${status}**`,
               ``,
-              `@${commentUser}, the test run has completed.`,
+              `@${commentUser}, the benchmark run has completed.`,
               ``,
               `\`\`\``,
               summary,
               `\`\`\``,
               ``,
-              `📦 **[Download Logs](${runUrl})** or check the Artifacts section.`,
+              `📦 **[Download Results](${runUrl})** or check the Artifacts section.`,
             ].join('\n');
 
             await github.rest.issues.createComment({
@@ -197,12 +204,3 @@ jobs:
               repo: context.repo.repo,
               body: body
             })
-
-      - name: Upload LLM call logs
-        if: always()
-        uses: actions/upload-artifact@v7
-        with:
-          name: llm-logs-ubuntu
-          path: .structureclaw/logs/*.jsonl
-          retention-days: 7
-          if-no-files-found: ignore
diff --git a/AGENTS.md b/AGENTS.md
index c331e03a..641d924f 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -67,7 +67,8 @@ node tests/runner.mjs validate validate-agent-orchestration  # Agent orchestrati
 node tests/runner.mjs validate validate-chat-stream-contract # Chat stream contract
 node tests/runner.mjs validate validate-analyze-contract     # Analyze endpoint contract
 node tests/runner.mjs smoke-native                           # Full native install smoke (mirrors CI)
-node tests/runner.mjs llm-integration                        # LLM integration tests (needs LLM_API_KEY)
+node tests/runner.mjs llm-benchmark                          # LLM benchmark: v2 assertions, skill-trace, LLM-as-Judge (needs LLM_API_KEY)
+node tests/runner.mjs llm-benchmark --scenario <id>          # Run a single benchmark scenario
 ```
 
 ## Key Conventions
diff --git a/backend/src/agent-skills/structure-type/beam/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/beam/__llm_tests__/cases.json
index 1bbe4762..968cf506 100644
--- a/backend/src/agent-skills/structure-type/beam/__llm_tests__/cases.json
+++ b/backend/src/agent-skills/structure-type/beam/__llm_tests__/cases.json
@@ -6,7 +6,9 @@
       "scenarioId": "beam-routing-zh",
       "category": "routing",
       "locale": "zh",
-      "messages": ["一根简支梁，跨度6米"],
+      "messages": [
+        "一根简支梁，跨度6米"
+      ],
       "variants": {
         "legacy": {
           "expect": {
@@ -20,7 +22,9 @@
       "scenarioId": "beam-routing-en",
       "category": "routing",
       "locale": "en",
-      "messages": ["a simply supported beam, span 6m"],
+      "messages": [
+        "a simply supported beam, span 6m"
+      ],
       "variants": {
         "legacy": {
           "expect": {
@@ -29,274 +33,6 @@
           }
         }
       }
-    },
-    {
-      "scenarioId": "beam-params-zh",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["一根悬臂梁，长3米，端部集中力10kN"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["beam"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "inferredType": "beam",
-            "criticalMissing": [],
-            "draftPatch": {
-              "lengthM": {
-                "value": 3,
-                "tolerance": 0.05
-              },
-              "supportType": "cantilever",
-              "loadKN": {
-                "value": 10,
-                "tolerance": 0.05
-              },
-              "loadType": "point"
-            }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "beam-params-en",
-      "category": "extraction",
-      "locale": "en",
-      "messages": ["cantilever beam, 4m long, point load 15kN at tip"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["beam"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "inferredType": "beam",
-            "criticalMissing": [],
-            "draftPatch": {
-              "lengthM": {
-                "value": 4,
-                "tolerance": 0.05
-              },
-              "supportType": "cantilever",
-              "loadKN": {
-                "value": 15,
-                "tolerance": 0.05
-              },
-              "loadType": "point"
-            }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "beam-distributed-zh",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["简支梁6米，均布荷载20kN/m"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["beam"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "inferredType": "beam",
-            "criticalMissing": [],
-            "draftPatch": {
-              "lengthM": {
-                "value": 6,
-                "tolerance": 0.05
-              },
-              "supportType": "simply-supported",
-              "loadKN": {
-                "value": 20,
-                "tolerance": 0.05
-              },
-              "loadType": "distributed"
-            }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "beam-position-zh",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["简支梁8米，在距左端3米处作用集中力25kN"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["beam"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "inferredType": "beam",
-            "criticalMissing": [],
-            "draftPatch": {
-              "lengthM": {
-                "value": 8,
-                "tolerance": 0.05
-              },
-              "loadKN": {
-                "value": 25,
-                "tolerance": 0.05
-              },
-              "loadPositionM": {
-                "value": 3,
-                "tolerance": 0.05
-              }
-            }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "beam-pipeline-zh",
-      "category": "pipeline",
-      "locale": "zh",
-      "messages": ["简支梁6米，均布荷载20kN/m，请进行静力分析"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["beam", "opensees-static", "validation-structure-model"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "routing": {
-              "structuralSkillId": "beam",
-              "analysisSkillId": "opensees-static"
-            },
-            "success": true,
-            "toolCalls": [
-              "build_model",
-              "run_analysis"
-            ],
-            "modelBuilt": true,
-            "analysisSuccess": true
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic", "opensees-static"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": {
-              "structuralSkillId": "generic",
-              "analysisSkillId": "opensees-static"
-            }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "beam-clarify-zh",
-      "category": "clarification",
-      "locale": "zh",
-      "turns": [
-        {
-          "message": "一根梁",
-          "assertions": {
-            "criticalMissingIncludes": [
-              "lengthM"
-            ],
-            "modelBuilt": false
-          }
-        },
-        {
-          "message": "6米",
-          "assertions": {
-            "criticalMissingIncludes": [
-              "loadKN"
-            ],
-            "modelBuilt": false
-          }
-        },
-        {
-          "message": "20kN均布荷载",
-          "assertions": {
-            "criticalMissing": [],
-            "modelBuilt": true,
-            "draftPatch": {
-              "lengthM": {
-                "value": 6,
-                "tolerance": 0.05
-              },
-              "supportType": "simply-supported",
-              "loadKN": {
-                "value": 20,
-                "tolerance": 0.05
-              },
-              "loadType": "distributed"
-            }
-          }
-        }
-      ],
-      "variants": {
-        "legacy": {
-          "expect": {}
-        }
-      }
-    },
-    {
-      "scenarioId": "beam-clarify-revise",
-      "category": "clarification",
-      "locale": "zh",
-      "turns": [
-        {
-          "message": "简支梁6米，均布荷载10kN/m",
-          "assertions": {
-            "criticalMissing": [],
-            "modelBuilt": true,
-            "draftPatch": {
-              "lengthM": {
-                "value": 6,
-                "tolerance": 0.05
-              },
-              "loadKN": {
-                "value": 10,
-                "tolerance": 0.05
-              }
-            }
-          }
-        },
-        {
-          "message": "改成8米",
-          "assertions": {
-            "criticalMissing": [],
-            "modelBuilt": true,
-            "draftPatch": {
-              "lengthM": {
-                "value": 8,
-                "tolerance": 0.05
-              }
-            }
-          }
-        }
-      ],
-      "variants": {
-        "legacy": {
-          "expect": {}
-        }
-      }
     }
   ]
 }
diff --git a/backend/src/agent-skills/structure-type/double-span-beam/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/double-span-beam/__llm_tests__/cases.json
index 94d1aa74..5ebaf6f9 100644
--- a/backend/src/agent-skills/structure-type/double-span-beam/__llm_tests__/cases.json
+++ b/backend/src/agent-skills/structure-type/double-span-beam/__llm_tests__/cases.json
@@ -6,7 +6,9 @@
       "scenarioId": "double-span-beam-routing-zh",
       "category": "routing",
       "locale": "zh",
-      "messages": ["双跨连续梁，总长12m，两跨各6m"],
+      "messages": [
+        "双跨连续梁，总长12m，两跨各6m"
+      ],
       "variants": {
         "legacy": {
           "expect": {
@@ -15,28 +17,6 @@
           }
         }
       }
-    },
-    {
-      "scenarioId": "double-span-beam-basic",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["双跨连续梁，总长12m，两跨各6m，均布荷载18kN/m"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["double-span-beam"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "double-span-beam" }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
     }
   ]
 }
diff --git a/backend/src/agent-skills/structure-type/frame/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/frame/__llm_tests__/cases.json
index 3118adff..6768b089 100644
--- a/backend/src/agent-skills/structure-type/frame/__llm_tests__/cases.json
+++ b/backend/src/agent-skills/structure-type/frame/__llm_tests__/cases.json
@@ -6,7 +6,9 @@
       "scenarioId": "frame-routing-zh",
       "category": "routing",
       "locale": "zh",
-      "messages": ["三层单跨钢框架，层高3.6m，跨度6m"],
+      "messages": [
+        "三层单跨钢框架，层高3.6m，跨度6m"
+      ],
       "variants": {
         "legacy": {
           "expect": {
@@ -20,7 +22,9 @@
       "scenarioId": "frame-routing-en",
       "category": "routing",
       "locale": "en",
-      "messages": ["3-story single-bay steel frame, story height 3.6m, bay width 6m"],
+      "messages": [
+        "3-story single-bay steel frame, story height 3.6m, bay width 6m"
+      ],
       "variants": {
         "legacy": {
           "expect": {
@@ -29,179 +33,6 @@
           }
         }
       }
-    },
-    {
-      "scenarioId": "frame-extraction-multi-story",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["4层2跨钢框架，层高3.3m，跨度5.4m，楼面荷载12kN/m²"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["frame"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "inferredType": "frame",
-            "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCount", "bayWidthsM"],
-            "draftPatch": {
-              "storyCount": 4,
-              "bayCount": 2,
-              "storyHeightsM": { "value": [3.3, 3.3, 3.3, 3.3], "tolerance": 0.05 },
-              "bayWidthsM": { "value": [5.4, 5.4], "tolerance": 0.05 }
-            }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "frame-params-en",
-      "category": "extraction",
-      "locale": "en",
-      "messages": ["2-story 3-bay frame, story height 3.6m, bay widths 6m, 7.5m, 6m"],
-      "variants": {
-        "legacy": {
-          "expect": {
-            "inferredType": "frame",
-            "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCount", "bayWidthsM"],
-            "draftPatch": {
-              "storyCount": 2,
-              "bayCount": 3,
-              "bayWidthsM": { "value": [6, 7.5, 6], "tolerance": 0.05 }
-            }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "frame-steel-zh",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["6层3跨钢框架Q345，层高3.6m，跨度7.2m"],
-      "variants": {
-        "legacy": {
-          "expect": {
-            "inferredType": "frame",
-            "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCount", "bayWidthsM"],
-            "draftPatch": {
-              "storyCount": 6,
-              "bayCount": 3,
-              "frameMaterial": "Q345"
-            }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "frame-3d-zh",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["3层2x3跨空间框架，层高4m，X向跨度6m，Y向跨度7.5m"],
-      "variants": {
-        "legacy": {
-          "expect": {
-            "inferredType": "frame",
-            "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCountX", "bayCountY", "bayWidthsXM", "bayWidthsYM"],
-            "draftPatch": {
-              "frameDimension": "3d",
-              "storyCount": 3,
-              "bayCountX": 2,
-              "bayCountY": 3
-            }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "frame-static-basic",
-      "category": "pipeline",
-      "locale": "en",
-      "messages": ["2-story single-bay steel frame, story height 3.6m, bay 6m, floor load 10kN/m2, analyze and check against steel code"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["frame", "opensees-static", "code-check-gb50017", "validation-structure-model", "postprocess-builtin"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "routing": {
-              "selectedSkillIds": ["code-check-gb50017", "frame", "opensees-static", "validation-structure-model", "postprocess-builtin"],
-              "structuralSkillId": "frame",
-              "analysisSkillId": "opensees-static"
-            },
-            "toolCalls": ["build_model", "run_analysis", "run_code_check"],
-            "toolAuthorizers": {
-              "build_model": ["frame"],
-              "run_analysis": ["opensees-static"],
-              "run_code_check": ["code-check-gb50017"]
-            }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic", "opensees-static", "code-check-gb50017", "validation-structure-model", "postprocess-builtin"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": {
-              "selectedSkillIds": ["code-check-gb50017", "generic", "opensees-static", "validation-structure-model", "postprocess-builtin"],
-              "structuralSkillId": "generic",
-              "analysisSkillId": "opensees-static"
-            },
-            "toolCalls": ["build_model", "run_analysis", "run_code_check"],
-            "toolAuthorizers": {
-              "run_code_check": ["code-check-gb50017"]
-            }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "frame-pipeline-multi-bay-zh",
-      "category": "pipeline",
-      "locale": "zh",
-      "messages": ["3层2跨框架，层高3.3m，跨度5.4m和6m，每层楼面荷载15kN/m，请进行静力分析并输出报告"],
-      "enabledSkillIds": ["frame", "opensees-static", "validation-structure-model"],
-      "variants": {
-        "legacy": {
-          "expect": {
-            "success": true,
-            "toolCalls": ["build_model", "run_analysis"],
-            "modelBuilt": true,
-            "analysisSuccess": true
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "frame-clarify-en",
-      "category": "clarification",
-      "locale": "en",
-      "turns": [
-        {
-          "message": "a steel frame",
-          "assertions": {
-            "criticalMissingIncludes": ["storyCount"],
-            "modelBuilt": false
-          }
-        },
-        {
-          "message": "3 stories, 4.2m each, single bay 8m, floor load 12kN/m2",
-          "assertions": {
-            "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCount", "bayWidthsM"],
-            "draftPatch": {
-              "storyCount": 3,
-              "bayCount": 1
-            }
-          }
-        }
-      ],
-      "variants": {
-        "legacy": {
-          "expect": {}
-        }
-      }
     }
   ]
 }
diff --git a/backend/src/agent-skills/structure-type/portal-frame/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/portal-frame/__llm_tests__/cases.json
index bdad1f62..b24b2864 100644
--- a/backend/src/agent-skills/structure-type/portal-frame/__llm_tests__/cases.json
+++ b/backend/src/agent-skills/structure-type/portal-frame/__llm_tests__/cases.json
@@ -6,7 +6,9 @@
       "scenarioId": "portal-frame-routing-zh",
       "category": "routing",
       "locale": "zh",
-      "messages": ["门式刚架，跨度18m，高度6m"],
+      "messages": [
+        "门式刚架，跨度18m，高度6m"
+      ],
       "variants": {
         "legacy": {
           "expect": {
@@ -20,7 +22,9 @@
       "scenarioId": "portal-frame-routing-en",
       "category": "routing",
       "locale": "en",
-      "messages": ["portal frame, span 18m, height 6m"],
+      "messages": [
+        "portal frame, span 18m, height 6m"
+      ],
       "variants": {
         "legacy": {
           "expect": {
@@ -29,166 +33,6 @@
           }
         }
       }
-    },
-    {
-      "scenarioId": "portal-frame-params-zh",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["门式刚架，跨度24m，高度8m，屋面荷载5kN/m"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["portal-frame"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "inferredType": "portal-frame",
-            "criticalMissing": [],
-            "draftPatch": {
-              "spanLengthM": {
-                "value": 24,
-                "tolerance": 0.05
-              },
-              "heightM": {
-                "value": 8,
-                "tolerance": 0.05
-              },
-              "loadKN": {
-                "value": 5,
-                "tolerance": 0.05
-              }
-            }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "portal-frame-params-en",
-      "category": "extraction",
-      "locale": "en",
-      "messages": ["portal frame, span 21m, height 7.5m, roof load 8kN"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["portal-frame"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "inferredType": "portal-frame",
-            "criticalMissing": [],
-            "draftPatch": {
-              "spanLengthM": {
-                "value": 21,
-                "tolerance": 0.05
-              },
-              "heightM": {
-                "value": 7.5,
-                "tolerance": 0.05
-              },
-              "loadKN": {
-                "value": 8,
-                "tolerance": 0.05
-              }
-            }
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "portal-frame-pipeline-zh",
-      "category": "pipeline",
-      "locale": "zh",
-      "messages": ["门式刚架，跨度18m，高度7m，屋面荷载6kN/m，分析"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["portal-frame", "opensees-static", "validation-structure-model"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "routing": {
-              "structuralSkillId": "portal-frame",
-              "analysisSkillId": "opensees-static"
-            },
-            "success": true,
-            "toolCalls": [
-              "build_model",
-              "run_analysis"
-            ],
-            "modelBuilt": true,
-            "analysisSuccess": true
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic", "opensees-static"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": {
-              "structuralSkillId": "generic",
-              "analysisSkillId": "opensees-static"
-            }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "portal-frame-clarify-zh",
-      "category": "clarification",
-      "locale": "zh",
-      "turns": [
-        {
-          "message": "门式刚架",
-          "assertions": {
-            "criticalMissingIncludes": [
-              "spanLengthM"
-            ],
-            "modelBuilt": false
-          }
-        },
-        {
-          "message": "跨度24m，高8m",
-          "assertions": {
-            "criticalMissingIncludes": [
-              "loadKN"
-            ],
-            "modelBuilt": false
-          }
-        },
-        {
-          "message": "荷载10kN/m",
-          "assertions": {
-            "criticalMissing": [],
-            "modelBuilt": true,
-            "draftPatch": {
-              "spanLengthM": {
-                "value": 24,
-                "tolerance": 0.05
-              },
-              "heightM": {
-                "value": 8,
-                "tolerance": 0.05
-              },
-              "loadKN": {
-                "value": 10,
-                "tolerance": 0.05
-              }
-            }
-          }
-        }
-      ],
-      "variants": {
-        "legacy": {
-          "expect": {}
-        }
-      }
     }
   ]
 }
diff --git a/backend/src/agent-skills/structure-type/truss/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/truss/__llm_tests__/cases.json
index 51505500..95b31439 100644
--- a/backend/src/agent-skills/structure-type/truss/__llm_tests__/cases.json
+++ b/backend/src/agent-skills/structure-type/truss/__llm_tests__/cases.json
@@ -6,7 +6,9 @@
       "scenarioId": "truss-routing-zh",
       "category": "routing",
       "locale": "zh",
-      "messages": ["三角桁架，跨度12m，高3m，节点荷载20kN"],
+      "messages": [
+        "三角桁架，跨度12m，高3m，节点荷载20kN"
+      ],
       "variants": {
         "legacy": {
           "expect": {
@@ -15,62 +17,6 @@
           }
         }
       }
-    },
-    {
-      "scenarioId": "truss-extraction-zh",
-      "category": "extraction",
-      "locale": "zh",
-      "messages": ["三角桁架，跨度12m，高3m，节点荷载20kN"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["truss"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "inferredType": "truss"
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": { "structuralSkillId": "generic" }
-          }
-        }
-      }
-    },
-    {
-      "scenarioId": "truss-static-basic",
-      "category": "pipeline",
-      "locale": "zh",
-      "messages": ["三角桁架，跨度12m，高3m，节点荷载20kN，做静力分析"],
-      "variants": {
-        "specific": {
-          "enabledSkillIds": ["truss", "opensees-static", "validation-structure-model"],
-          "fallbackPolicy": "forbid-generic",
-          "expect": {
-            "routing": {
-              "structuralSkillId": "truss",
-              "analysisSkillId": "opensees-static"
-            },
-            "success": true,
-            "toolCalls": ["build_model", "run_analysis"],
-            "analysisSuccess": true
-          }
-        },
-        "generic": {
-          "enabledSkillIds": ["generic", "opensees-static", "validation-structure-model"],
-          "fallbackPolicy": "require-generic",
-          "expect": {
-            "routing": {
-              "structuralSkillId": "generic",
-              "analysisSkillId": "opensees-static"
-            },
-            "success": true,
-            "toolCalls": ["build_model", "run_analysis"],
-            "analysisSuccess": true
-          }
-        }
-      }
     }
   ]
 }
diff --git a/tests/llm-benchmark/lib/evaluate.cjs b/tests/llm-benchmark/lib/evaluate.cjs
index 68afa30b..7c1ca551 100644
--- a/tests/llm-benchmark/lib/evaluate.cjs
+++ b/tests/llm-benchmark/lib/evaluate.cjs
@@ -1,65 +1,248 @@
 /**
  * Evaluate a completed AgentState against a scenario's expectations.
  *
+ * Supports v2 assertions (type-dispatched) with automatic v1 backward compatibility.
  * Returns a structured result with per-metric pass/fail and an overall score.
+ *
+ * evaluateScenario is async because natural_language assertions use LLM-as-Judge.
  */
-function evaluateScenario(scenario, state, durationMs) {
-  const metrics = [];
-  const expect = scenario.expect || {};
 
-  // Structural type detection
-  if (expect.structuralType) {
-    const actual = state.structuralTypeKey || null;
-    metrics.push({
-      metric: "structuralType",
-      pass: actual === expect.structuralType,
-      expected: expect.structuralType,
-      actual: actual || "(none)",
-    });
+const { extractSkillTrace } = require("./skill-trace.cjs");
+const { evaluateNaturalLanguage } = require("./judge.cjs");
+
+const ANALYSIS_RESULT_KEYS = [
+  "displacements", "nodeDisplacements", "reactions",
+  "nodeReactions", "memberForces", "forces",
+];
+
+// ---------------------------------------------------------------------------
+// v1 → v2 auto-upgrade
+// ---------------------------------------------------------------------------
+
+/**
+ * Upgrade a v1 scenario expect object to the v2 assertions array format.
+ * v2 format is used when `expect.assertions` is already present.
+ *
+ * @param {object} expect - scenario.expect
+ * @returns {{ assertions: object[] }}
+ */
+function upgradeExpect(expect) {
+  if (Array.isArray(expect.assertions)) {
+    return { assertions: expect.assertions };
   }
 
-  // Model building
+  // v1 → v2 conversion
+  const assertions = [];
+
+  if (expect.structuralType) {
+    assertions.push({ type: "structural_type", expected: expect.structuralType });
+  }
   if (expect.hasModel) {
-    const model = state.model;
-    const nodes = Array.isArray(model?.nodes) ? model.nodes : [];
-    const elements = Array.isArray(model?.elements) ? model.elements : [];
-    const minNodes = expect.minNodes ?? 2;
-    const minElements = expect.minElements ?? 1;
-    metrics.push({
-      metric: "model",
-      pass: !!model && nodes.length >= minNodes && elements.length >= minElements,
-      expected: `>= ${minNodes} nodes, >= ${minElements} elements`,
-      actual: model ? `${nodes.length} nodes, ${elements.length} elements` : "(none)",
+    assertions.push({
+      type: "has_model",
+      minNodes: expect.minNodes ?? 2,
+      minElements: expect.minElements ?? 1,
     });
   }
-
-  // Analysis completion
   if (expect.hasAnalysis) {
-    const analysis = state.analysisResult;
-    const hasDisplacements = analysis && (
-      Array.isArray(analysis.displacements) || Array.isArray(analysis.nodeDisplacements)
-    );
-    metrics.push({
-      metric: "analysis",
-      pass: !!analysis && (hasDisplacements || Object.keys(analysis).length > 0),
+    assertions.push({ type: "has_analysis" });
+  }
+  if (expect.hasReport) {
+    assertions.push({ type: "has_report" });
+  }
+
+  return { assertions };
+}
+
+// ---------------------------------------------------------------------------
+// Typed assertion evaluators
+// ---------------------------------------------------------------------------
+
+// Compare the structural type key recorded on the state with the asserted one.
+function evalStructuralType(assertion, state) {
+  const { expected } = assertion;
+  const detected = state.structuralTypeKey ? state.structuralTypeKey : null;
+  const pass = detected === expected;
+  // A missing or empty key is reported as "(none)".
+  const actual = detected === null ? "(none)" : detected;
+  return { metric: "structural_type", pass, expected, actual };
+}
+
+// Pass when the state holds a model with at least the required node/element counts.
+function evalHasModel(assertion, state) {
+  const { model } = state;
+  const count = (list) => (Array.isArray(list) ? list.length : 0);
+  const nodeCount = count(model?.nodes);
+  const elementCount = count(model?.elements);
+  // Thresholds default to 2 nodes and 1 element when the assertion omits them.
+  const minNodes = assertion.minNodes ?? 2;
+  const minElements = assertion.minElements ?? 1;
+  const pass = Boolean(model) && nodeCount >= minNodes && elementCount >= minElements;
+  const actual = model ? `${nodeCount} nodes, ${elementCount} elements` : "(none)";
+  return { metric: "has_model", pass, expected: `>= ${minNodes} nodes, >= ${minElements} elements`, actual };
+}
+
+// True when any of the named properties of obj holds a non-empty array or object.
+function hasResultField(obj, names) {
+  if (obj === null || typeof obj !== "object") return false;
+  for (const key of names) {
+    const candidate = obj[key];
+    if (candidate === null || typeof candidate !== "object") continue;
+    if ((Array.isArray(candidate) ? candidate.length : Object.keys(candidate).length) > 0) return true;
+  }
+  return false;
+}
+
+function evalHasAnalysis(_assertion, state) {
+  const analysis = state.analysisResult;
+  if (!analysis) {
+    return {
+      metric: "has_analysis",
+      pass: false,
       expected: "analysis results present",
-      actual: analysis ? "present" : "(none)",
-    });
+      actual: "(none)",
+    };
   }
+  const pass = hasResultField(analysis, ANALYSIS_RESULT_KEYS) || hasResultField(analysis.data, ANALYSIS_RESULT_KEYS);
+  return {
+    metric: "has_analysis",
+    pass,
+    expected: "analysis results with displacements, reactions, or forces",
+    actual: pass ? "present" : `keys: ${Object.keys(analysis).join(", ") || "(empty)"}`,
+  };
+}
 
-  // Report generation
-  if (expect.hasReport) {
-    const report = state.report;
-    const mdLength = typeof report?.markdown === "string" ? report.markdown.length : 0;
-    metrics.push({
-      metric: "report",
-      pass: mdLength > 100,
-      expected: "markdown > 100 chars",
-      actual: report ? `${mdLength} chars` : "(none)",
-    });
+// Pass when state.report.markdown is a string longer than 100 characters.
+function evalHasReport(_assertion, state) {
+  const { report } = state;
+  // Anything other than a string markdown body counts as zero characters.
+  let chars = 0;
+  if (typeof report?.markdown === "string") chars = report.markdown.length;
+  const pass = chars > 100;
+  const actual = report ? `${chars} chars` : "(none)";
+  return { metric: "has_report", pass, expected: "markdown > 100 chars", actual };
+}
+
+/**
+ * Check the skill chosen by the most recent detect_structure_type tool result.
+ * Passes when that skillId is `assertion.primary` or listed in
+ * `assertion.mayAlsoMatch`; with neither given, any non-null skillId passes.
+ */
+function evalSkillMatch(assertion, state) {
+  const trace = extractSkillTrace(Array.isArray(state.messages) ? state.messages : []);
+  const actual = trace?.skillId || null;
+  const primary = assertion.primary;
+  const mayAlsoMatch = Array.isArray(assertion.mayAlsoMatch) ? assertion.mayAlsoMatch : [];
+  const allowed = primary ? [primary, ...mayAlsoMatch] : mayAlsoMatch;
+  const alternatives = mayAlsoMatch.join(", ");
+
+  let expected = "(any skill)";
+  if (primary) {
+    expected = mayAlsoMatch.length ? `${primary} (or: ${alternatives})` : `${primary}`;
+  } else if (allowed.length > 0) {
+    // Without a primary the alternatives are still enforced, so report them
+    // rather than claiming that any skill would be accepted.
+    expected = `one of: ${alternatives}`;
+  }
+  // An empty allow-list accepts any detected skill.
+  const pass = actual !== null && (allowed.length === 0 || allowed.includes(actual));
+  return { metric: "skill_match", pass, expected, actual: actual || "(none)" };
+}
+
+// Pass when any assistant message issued an ask_user_clarification tool call.
+function evalHasInteractionQuestions(_assertion, state) {
+  const history = Array.isArray(state.messages) ? state.messages : [];
+  const isClarification = (call) => call.name === "ask_user_clarification";
+
+  let asked = false;
+  for (const entry of history) {
+    // Only AI/assistant messages carrying a tool_calls array are considered.
+    const fromAssistant = entry.type === "ai" || entry.role === "assistant";
+    if (!fromAssistant || !Array.isArray(entry.tool_calls)) continue;
+    if (entry.tool_calls.some(isClarification)) {
+      asked = true;
+      break;
+    }
+  }
+  const actual = asked ? "questions found" : "no questions asked";
+  return { metric: "has_interaction_questions", pass: asked, expected: "agent asks user for missing parameters", actual };
+}
+
+// Ask the LLM judge whether the free-text criterion holds for this state.
+async function evalNaturalLanguage(assertion, state) {
+  const { description } = assertion;
+  const verdict = await evaluateNaturalLanguage(description, state);
+  const { pass, reason } = verdict;
+  // The judge's reason, when given, is appended to the failure text only.
+  let actual = "satisfied";
+  if (!pass) actual = reason ? `not satisfied — ${reason}` : "not satisfied";
+  return { metric: "natural_language", pass, expected: description, actual };
+}
+
+// ---------------------------------------------------------------------------
+// Main evaluator
+// ---------------------------------------------------------------------------
+
+/**
+ * Route one v2 assertion to the evaluator registered for its `type`.
+ * An unrecognised or missing type yields a failing "unknown:TYPE" metric.
+ *
+ * @param {object} assertion - v2 assertion object
+ * @param {object} state - AgentState
+ * @returns {Promise<{ metric: string, pass: boolean, expected: string, actual: string }>}
+ */
+async function dispatchAssertion(assertion, state) {
+  // A Map (not a plain object) so names like "constructor" never resolve.
+  const evaluators = new Map([
+    ["structural_type", evalStructuralType],
+    ["has_model", evalHasModel],
+    ["has_analysis", evalHasAnalysis],
+    ["has_report", evalHasReport],
+    ["skill_match", evalSkillMatch],
+    ["has_interaction_questions", evalHasInteractionQuestions],
+    ["natural_language", evalNaturalLanguage],
+  ]);
+  const kind = assertion.type;
+  const evaluate = evaluators.get(kind);
+  if (evaluate !== undefined) {
+    return evaluate(assertion, state);
+  }
+
+  return {
+    metric: `unknown:${kind || "undefined"}`,
+    pass: false,
+    expected: "valid assertion type (structural_type, has_model, has_analysis, has_report, skill_match, has_interaction_questions, natural_language)",
+    actual: `unsupported type: ${kind || "(undefined)"}`,
+  };
+}
+
+/**
+ * Evaluate a completed AgentState against a scenario's expectations.
+ *
+ * @param {object} scenario - benchmark scenario (v1 or v2 format)
+ * @param {object} state - AgentState returned by service.runFull
+ * @param {number} durationMs - elapsed time in milliseconds
+ * @returns {Promise<object>} evaluation result
+ */
+async function evaluateScenario(scenario, state, durationMs) {
+  const metrics = [];
+  const { assertions } = upgradeExpect(scenario.expect || {});
+
+  for (const assertion of assertions) {
+    try {
+      metrics.push(await dispatchAssertion(assertion, state));
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      metrics.push({
+        metric: assertion.type || "unknown",
+        pass: false,
+        expected: "(assertion ran without error)",
+        actual: `error: ${msg}`,
+      });
+    }
   }
 
-  // Tool call count (informational)
+  // Tool call count (informational — always measured)
   let toolCallCount = 0;
   const messages = Array.isArray(state.messages) ? state.messages : [];
   for (const msg of messages) {
diff --git a/tests/llm-benchmark/lib/judge.cjs b/tests/llm-benchmark/lib/judge.cjs
new file mode 100644
index 00000000..934fa2d6
--- /dev/null
+++ b/tests/llm-benchmark/lib/judge.cjs
@@ -0,0 +1,234 @@
+/**
+ * LLM-as-Judge evaluator for natural_language assertions.
+ *
+ * Configuration (via environment variables):
+ *   LLM_JUDGE_MODEL    — model to use (falls back to LLM_MODEL, then "gpt-4o-mini")
+ *   LLM_JUDGE_API_KEY  — API key (falls back to LLM_API_KEY)
+ *   LLM_JUDGE_BASE_URL — base URL (falls back to LLM_BASE_URL, then "https://api.openai.com")
+ *
+ * Fixed parameters: temperature=0, max_tokens=500, timeout=30s
+ */
+
+const https = require("node:https");
+
+const JUDGE_TEMPERATURE = 0;
+const JUDGE_MAX_TOKENS = 500;
+const JUDGE_TIMEOUT_MS = 30_000;
+const MAX_RESPONSE_BODY = 100_000; // 100KB
+
+/**
+ * Condense the agent state into a few text lines for the judge prompt:
+ * structural type, model size, analysis keys with one sample each, report excerpt.
+ * @param {object} state - AgentState returned by runFull
+ * @returns {string} newline-joined summary, or "(no agent output)" when empty
+ */
+function summarizeAgentOutput(state) {
+  const { structuralTypeKey, model, analysisResult, report } = state;
+  const lines = [];
+  const lengthOf = (list) => (Array.isArray(list) ? list.length : 0);
+  // Emit the first entry of a non-empty array as a JSON sample line.
+  const pushSample = (label, list) => {
+    if (Array.isArray(list) && list.length > 0) {
+      lines.push(`${label}: ${JSON.stringify(list[0])}`);
+    }
+  };
+
+  if (structuralTypeKey) lines.push(`Structural type: ${structuralTypeKey}`);
+
+  if (model) {
+    lines.push(`Model: ${lengthOf(model.nodes)} nodes, ${lengthOf(model.elements)} elements`);
+  }
+
+  if (analysisResult) {
+    // "_raw" is left out of the key listing.
+    const keyList = Object.keys(analysisResult).filter((key) => key !== "_raw");
+    lines.push(`Analysis result keys: ${keyList.join(", ") || "(present)"}`);
+    pushSample("Sample displacement", analysisResult.displacements || analysisResult.nodeDisplacements);
+    pushSample("Sample reaction", analysisResult.reactions || analysisResult.nodeReactions);
+  }
+
+  if (report?.markdown) {
+    lines.push(`Report excerpt: ${report.markdown.slice(0, 500)}`);
+  }
+
+  if (lines.length === 0) return "(no agent output)";
+  return lines.join("\n");
+}
+
+/**
+ * Assemble the judge prompt: role preamble, the criterion, the agent output
+ * summary, and the required single-line JSON reply format.
+ * @param {string} description - natural language criterion
+ * @param {string} agentOutput - compact summary of agent state
+ * @returns {string}
+ */
+function buildJudgePrompt(description, agentOutput) {
+  const preamble = [
+    "You are a structural engineering test evaluator.",
+    "Based on the agent output below, judge whether the following criterion is satisfied.",
+  ].join("\n");
+  const criterion = `Criterion: ${description}`;
+  const output = ["Agent output:", agentOutput].join("\n");
+  const replyFormat = [
+    'Respond ONLY with a JSON object on a single line: {"pass": true} or {"pass": false, "reason": "brief explanation"}',
+    "Do not include any other text.",
+  ].join("\n");
+  return [preamble, criterion, output, replyFormat].join("\n\n");
+}
+
+/**
+ * Pull a JSON value out of the judge's reply. Attempts, in order:
+ *   1) the trimmed reply, with a wrapping ``` / ```json fence removed if present;
+ *   2) the span from the first "{" to the last "}".
+ * @param {string} response - raw LLM response
+ * @returns {object|null} parsed value, or null when nothing parses
+ */
+function extractJudgeJson(response) {
+  // Sentinel that tells a parse failure apart from a legitimately parsed null.
+  const FAILED = Symbol("unparseable");
+  const attempt = (candidate) => {
+    try {
+      return JSON.parse(candidate);
+    } catch {
+      return FAILED;
+    }
+  };
+
+  // Unwrap a reply that is entirely a fenced code block.
+  const trimmed = response.trim();
+  const fenced = /^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/.exec(trimmed);
+  const text = fenced ? fenced[1].trim() : trimmed;
+
+  const whole = attempt(text);
+  if (whole !== FAILED) return whole;
+
+  // Fallback: outermost brace span (first "{" to last "}"), not true balancing.
+  const open = text.indexOf("{");
+  const close = text.lastIndexOf("}");
+  if (open === -1 || close <= open) return null;
+  const inner = attempt(text.slice(open, close + 1));
+  return inner === FAILED ? null : inner;
+}
+
+/**
+ * Call the LLM judge chat-completions endpoint (HTTPS only).
+ * Configuration errors (missing API key, non-HTTPS base URL) throw synchronously;
+ * transport, timeout, size-limit, HTTP >= 400 and parse failures reject the promise.
+ * @param {string} prompt
+ * @returns {Promise<string>} trimmed content of the first choice
+ */
+function callLlmJudgeApi(prompt) {
+  const apiKey = process.env.LLM_JUDGE_API_KEY || process.env.LLM_API_KEY;
+  if (!apiKey) {
+    throw new Error("LLM_JUDGE_API_KEY or LLM_API_KEY is required for judge evaluation");
+  }
+  const model = process.env.LLM_JUDGE_MODEL || process.env.LLM_MODEL || "gpt-4o-mini";
+  const rawBase =
+    process.env.LLM_JUDGE_BASE_URL || process.env.LLM_BASE_URL || "https://api.openai.com";
+  const base = rawBase.endsWith("/") ? rawBase.slice(0, -1) : rawBase;
+
+  // Build URL handling bases that already include /v1 or other versioned paths
+  const chatPath = /\/v\d+$/.test(base) ? "/chat/completions" : "/v1/chat/completions";
+  const url = new URL(`${base}${chatPath}`);
+  if (url.protocol !== "https:") {
+    throw new Error(`Judge API must use HTTPS, got: ${url.protocol}`);
+  }
+
+  const bodyStr = JSON.stringify({
+    model,
+    temperature: JUDGE_TEMPERATURE,
+    max_tokens: JUDGE_MAX_TOKENS,
+    messages: [{ role: "user", content: prompt }],
+  });
+
+  return new Promise((resolve, reject) => {
+    let settled = false;
+    // Settle the promise at most once; later events are ignored.
+    const settle = (fn, value) => {
+      if (settled) return;
+      settled = true;
+      fn(value);
+    };
+    // Reject first, then tear the request down. Rejecting must not be left to
+    // the "error" event that destroy() emits, or the promise could stay pending.
+    const abort = (err) => {
+      settle(reject, err);
+      req.destroy(err);
+    };
+
+    const req = https.request(
+      {
+        hostname: url.hostname,
+        port: url.port || 443,
+        path: url.pathname + url.search,
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+          Authorization: `Bearer ${apiKey}`,
+          "Content-Length": Buffer.byteLength(bodyStr),
+        },
+      },
+      (res) => {
+        // Decode as UTF-8 across chunk boundaries so multi-byte characters survive.
+        res.setEncoding("utf8");
+        let data = "";
+        res.on("data", (chunk) => {
+          data += chunk;
+          if (data.length > MAX_RESPONSE_BODY) {
+            abort(new Error("Judge response body exceeded 100KB limit"));
+          }
+        });
+        res.on("error", (err) => settle(reject, err));
+        res.on("end", () => {
+          if (res.statusCode && res.statusCode >= 400) {
+            settle(reject, new Error(`Judge API returned HTTP ${res.statusCode}`));
+            return;
+          }
+          try {
+            const parsed = JSON.parse(data);
+            const content = parsed.choices?.[0]?.message?.content ?? "";
+            settle(resolve, content.trim());
+          } catch {
+            settle(reject, new Error(`Failed to parse judge response: ${data.slice(0, 100)}`));
+          }
+        });
+      },
+    );
+
+    req.setTimeout(JUDGE_TIMEOUT_MS, () => {
+      abort(new Error("LLM judge request timed out after 30s"));
+    });
+    req.on("error", (err) => settle(reject, err));
+    req.write(bodyStr);
+    req.end();
+  });
+}
+
+/**
+ * Judge a natural_language assertion with the LLM and normalise its verdict.
+ * Judge/transport failures and unparseable replies become `pass: false` with
+ * an explanatory reason instead of rejecting.
+ *
+ * @param {string} description - the natural language criterion to evaluate
+ * @param {object} state - AgentState returned by runFull
+ * @returns {Promise<{ pass: boolean, reason?: string }>}
+ */
+async function evaluateNaturalLanguage(description, state) {
+  // Built outside the try, so errors raised here propagate to the caller.
+  const prompt = buildJudgePrompt(description, summarizeAgentOutput(state));
+
+  try {
+    const reply = await callLlmJudgeApi(prompt);
+    const verdict = extractJudgeJson(reply);
+    if (verdict) {
+      const reason = typeof verdict.reason === "string" ? verdict.reason : undefined;
+      return { pass: verdict.pass === true, reason };
+    }
+    return { pass: false, reason: `Judge returned non-JSON: ${reply.slice(0, 100)}` };
+  } catch (err) {
+    const detail = err instanceof Error ? err.message : String(err);
+    return { pass: false, reason: `Judge error: ${detail}` };
+  }
+}
+
+module.exports = { evaluateNaturalLanguage };
diff --git a/tests/llm-benchmark/lib/report.cjs b/tests/llm-benchmark/lib/report.cjs
index 1cc5dc02..46754949 100644
--- a/tests/llm-benchmark/lib/report.cjs
+++ b/tests/llm-benchmark/lib/report.cjs
@@ -3,6 +3,11 @@ function formatMetric(m) {
   return `  ${icon} ${m.metric.padEnd(16)} expected: ${m.expected.padEnd(30)} actual: ${m.actual}`;
 }
 
+// Per-turn line: 4-space indent, pass/fail mark, metric padded to 14, actual value.
+function formatTurnMetric({ pass, metric, actual }) {
+  return `    ${pass ? "\u2713" : "\u2717"} ${metric.padEnd(14)} ${actual}`;
+}
+
 function printScenarioResult(scenario, evaluation) {
   const status = evaluation.allPassed ? "PASS" : "FAIL";
   process.stdout.write(`\n${"=".repeat(60)}\n`);
@@ -10,9 +15,30 @@ function printScenarioResult(scenario, evaluation) {
   if (scenario.description) {
     process.stdout.write(`     ${scenario.description}\n`);
   }
-  process.stdout.write(`\n`);
-  for (const m of evaluation.metrics) {
-    process.stdout.write(formatMetric(m) + "\n");
+
+  if (evaluation.turnResults && Array.isArray(evaluation.turnResults)) {
+    const turns = scenario.turns || [];
+    for (const { turnIndex, evaluation: turnEval } of evaluation.turnResults) {
+      const turnMsg = turns[turnIndex]?.message || "(turn)";
+      const preview = turnMsg.length > 40 ? turnMsg.slice(0, 40) + "..." : turnMsg;
+      process.stdout.write(`\n  Turn ${turnIndex + 1}: "${preview}"\n`);
+      for (const m of turnEval.metrics) {
+        if (m.metric === "duration") continue;
+        if (m.metric === "toolCalls" && m.pass) continue;
+        process.stdout.write(formatTurnMetric(m) + "\n");
+      }
+    }
+    process.stdout.write(`\n`);
+    for (const m of evaluation.metrics) {
+      if (m.metric === "duration") {
+        process.stdout.write(formatMetric(m) + "\n");
+      }
+    }
+  } else {
+    process.stdout.write(`\n`);
+    for (const m of evaluation.metrics) {
+      process.stdout.write(formatMetric(m) + "\n");
+    }
   }
   process.stdout.write(`${"=".repeat(60)}\n`);
 }
diff --git a/tests/llm-benchmark/lib/skill-trace.cjs b/tests/llm-benchmark/lib/skill-trace.cjs
new file mode 100644
index 00000000..e62cc420
--- /dev/null
+++ b/tests/llm-benchmark/lib/skill-trace.cjs
@@ -0,0 +1,39 @@
+/**
+ * Find the latest routing decision in the agent's message history.
+ *
+ * Walks state.messages from newest to oldest and returns the first message
+ * named 'detect_structure_type' whose content is (or parses as JSON to) an
+ * object; messages whose content fails to parse are skipped.
+ *
+ * @param {unknown[]} messages - state.messages from AgentState
+ * @returns {{ skillId: string|null, structureType: string|null, mappedType: string|null } | null}
+ */
+function extractSkillTrace(messages) {
+  if (!Array.isArray(messages)) return null;
+
+  // Newest first, so a multi-turn conversation yields its most recent decision.
+  let index = messages.length;
+  while (index-- > 0) {
+    const entry = messages[index];
+    if (entry === null || typeof entry !== 'object') continue;
+    // Tool results are recognised by the message's name field.
+    if (entry.name !== 'detect_structure_type') continue;
+    try {
+      const raw = entry.content;
+      const payload = typeof raw === 'string' ? JSON.parse(raw) : raw;
+      if (payload === null || typeof payload !== 'object') continue;
+      const { skillId, key, mappedType } = payload;
+      return {
+        skillId: skillId || null,
+        structureType: key || null,
+        mappedType: mappedType || null,
+      };
+    } catch {
+      // Content that fails to parse is skipped; older messages are still checked.
+    }
+  }
+
+  return null;
+}
+
+module.exports = { extractSkillTrace };
diff --git a/tests/llm-benchmark/runner.cjs b/tests/llm-benchmark/runner.cjs
index 80aea53a..dfecf95b 100644
--- a/tests/llm-benchmark/runner.cjs
+++ b/tests/llm-benchmark/runner.cjs
@@ -2,7 +2,6 @@ const path = require("node:path");
 const fs = require("node:fs");
 const { pathToFileURL } = require("node:url");
 
-const { resolveIntegrationContext } = require("../llm-integration/lib/context.js");
 const { evaluateScenario } = require("./lib/evaluate.cjs");
 const { printScenarioResult, printSummary, writeJsonOutput } = require("./lib/report.cjs");
 
@@ -46,9 +45,63 @@ function parseBenchmarkOptions(args) {
   return { scenarioId, outputPath };
 }
 
+// Give every scenario a `turns` array and a `_multiTurn` flag. Scenarios that
+// already define turns are only flagged; single-message ones get one turn.
+// NOTE(review): only `expect.assertions` is carried into the synthesized turn;
+// v1-style expect fields (no assertions array) are not — confirm callers handle that.
+function normalizeScenario(scenario) {
+  // The input object is never mutated; a shallow copy is returned.
+  const alreadyMultiTurn = Boolean(scenario.turns);
+  if (alreadyMultiTurn) return { ...scenario, _multiTurn: true };
+
+  const onlyTurn = {
+    message: scenario.message,
+    assertions: scenario.expect?.assertions,
+  };
+  return { ...scenario, _multiTurn: false, turns: [onlyTurn] };
+}
+
+/**
+ * Fold per-turn evaluations into one scenario-level result.
+ * Per-turn "toolCalls" and "duration" metrics are dropped from the merged list;
+ * a single informational "duration" metric for the whole scenario is appended.
+ * `passed` and `total` are both derived from the merged list, so the appended
+ * always-passing duration metric is counted on both sides.
+ *
+ * @param {object} scenario - scenario (reads `id` and `description`)
+ * @param {{ turnIndex: number, evaluation: object }[]} turnResults
+ * @param {number} totalDurationMs - total elapsed time in milliseconds
+ * @returns {object} merged evaluation, including the per-turn results
+ */
+function mergeTurnResults(scenario, turnResults, totalDurationMs) {
+  const isInformational = (m) => m.metric === "toolCalls" || m.metric === "duration";
+  const metrics = turnResults.flatMap(({ evaluation }) =>
+    evaluation.metrics.filter((m) => !isInformational(m)),
+  );
+  metrics.push({
+    metric: "duration",
+    pass: true,
+    expected: "(info)",
+    actual: `${(totalDurationMs / 1000).toFixed(1)}s`,
+  });
+
+  return {
+    scenarioId: scenario.id,
+    description: scenario.description || "",
+    passed: metrics.filter((m) => m.pass).length,
+    total: metrics.length,
+    // A turn fails the scenario via its own allPassed flag, not the filtered metrics.
+    allPassed: turnResults.every(({ evaluation }) => evaluation.allPassed),
+    metrics,
+    durationMs: totalDurationMs,
+    turnResults: turnResults.map((r) => ({ turnIndex: r.turnIndex, evaluation: r.evaluation })),
+  };
+}
+
 async function runBenchmark(rootDir, args) {
   const options = parseBenchmarkOptions(args);
-  const context = resolveIntegrationContext(rootDir);
+  const { resolveRegressionContext } = require("../regression/shared.js");
+  const context = resolveRegressionContext(rootDir);
 
   // Inject LLM env vars
   for (const [k, v] of Object.entries(context.env)) {
@@ -89,36 +142,100 @@ async function runBenchmark(rootDir, args) {
 
   const results = [];
 
-  for (const scenario of scenarios) {
-    process.stdout.write(`\nRunning: ${scenario.id}...\n`);
-    const startTime = Date.now();
+  for (const rawScenario of scenarios) {
+    const scenario = normalizeScenario(rawScenario);
+    const maxRetries = Math.max(0, typeof scenario.maxRetries === "number" ? scenario.maxRetries : 0);
+    let attempt = 0;
+    let lastEvaluation = null;
+
+    while (attempt <= maxRetries) {
+      if (attempt > 0) {
+        process.stdout.write(`  Retrying (attempt ${attempt + 1}/${maxRetries + 1})...\n`);
+      } else {
+        process.stdout.write(`\nRunning: ${scenario.id}...\n`);
+      }
+
+      const scenarioStart = Date.now();
+      let executionError = false;
+      const turnResults = [];
+      const conversationId = `bench-${scenario.id}-${scenarioStart}-${attempt}`;
+
+      // Suppress noisy agent logs during execution
+      const prevLogLevel = process.env.LOG_LEVEL;
+      process.env.LOG_LEVEL = "warn";
+
+      try {
+        for (let i = 0; i < scenario.turns.length; i++) {
+          const turn = scenario.turns[i];
+          const turnStart = Date.now();
+
+          const state = await service.runFull({
+            message: turn.message,
+            conversationId,
+            context: { locale: scenario.locale || "zh" },
+          });
+
+          const turnDurationMs = Date.now() - turnStart;
+
+          if (turn.assertions && turn.assertions.length > 0) {
+            const turnScenario = { ...scenario, expect: { assertions: turn.assertions } };
+            const evaluation = await evaluateScenario(turnScenario, state, turnDurationMs);
+            turnResults.push({ turnIndex: i, evaluation });
+          }
+        }
+      } catch (err) {
+        executionError = true;
+        const message = err instanceof Error ? err.message : String(err);
+        process.stdout.write(`  error: ${message}\n`);
+        turnResults.push({
+          turnIndex: scenario.turns.length - 1,
+          evaluation: {
+            scenarioId: scenario.id,
+            description: scenario.description || "",
+            passed: 0,
+            total: 1,
+            allPassed: false,
+            metrics: [{ metric: "execution", pass: false, expected: "no error", actual: message }],
+            durationMs: Date.now() - scenarioStart,
+          },
+        });
+      } finally {
+        if (prevLogLevel === undefined) {
+          delete process.env.LOG_LEVEL;
+        } else {
+          process.env.LOG_LEVEL = prevLogLevel;
+        }
+      }
+
+      const totalDurationMs = Date.now() - scenarioStart;
+
+      if (turnResults.length > 0) {
+        if (scenario._multiTurn) {
+          lastEvaluation = mergeTurnResults(scenario, turnResults, totalDurationMs);
+        } else {
+          lastEvaluation = turnResults[0].evaluation;
+        }
+      }
+
+      if (lastEvaluation && lastEvaluation.allPassed) break;
+      if (executionError) break;
+      attempt += 1;
+    }
 
-    try {
-      const state = await service.runFull({
-        message: scenario.message,
-        conversationId: `bench-${scenario.id}-${startTime}`,
-        context: { locale: scenario.locale || "zh" },
-      });
-
-      const durationMs = Date.now() - startTime;
-      const evaluation = evaluateScenario(scenario, state, durationMs);
-      printScenarioResult(scenario, evaluation);
-      results.push(evaluation);
-    } catch (err) {
-      const durationMs = Date.now() - startTime;
-      const message = err instanceof Error ? err.message : String(err);
-      process.stdout.write(`\nFAIL  ${scenario.id}\n`);
-      process.stdout.write(`  error: ${message}\n`);
-      results.push({
+    if (!lastEvaluation) {
+      lastEvaluation = {
         scenarioId: scenario.id,
         description: scenario.description || "",
         passed: 0,
         total: 1,
         allPassed: false,
-        metrics: [{ metric: "execution", pass: false, expected: "no error", actual: message }],
-        durationMs,
-      });
+        metrics: [{ metric: "execution", pass: false, expected: "no evaluation produced", actual: "(none)" }],
+        durationMs: 0,
+      };
     }
+
+    printScenarioResult(rawScenario, lastEvaluation);
+    results.push(lastEvaluation);
   }
 
   printSummary(results);
diff --git a/tests/llm-benchmark/scenarios/beam.json b/tests/llm-benchmark/scenarios/beam.json
index 775bff8d..43fb7b69 100644
--- a/tests/llm-benchmark/scenarios/beam.json
+++ b/tests/llm-benchmark/scenarios/beam.json
@@ -2,40 +2,84 @@
   {
     "id": "beam-static-6m",
     "description": "简支梁6米均布荷载静力分析",
+    "category": "static-analysis",
+    "tags": ["beam", "static", "zh", "basic"],
     "message": "简支梁6米，均布荷载20kN/m，请进行静力分析",
     "locale": "zh",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "beam",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 2,
-      "minElements": 1
+      "skills": { "primary": "beam", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "beam" },
+        { "type": "skill_match", "primary": "beam", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 2, "minElements": 1 },
+        { "type": "has_analysis" },
+        { "type": "natural_language", "description": "Analysis results should include displacement or deflection values" }
+      ]
     }
   },
   {
     "id": "beam-cantilever-point-load",
     "description": "悬臂梁端部集中力静力分析",
+    "category": "static-analysis",
+    "tags": ["beam", "static", "zh", "cantilever"],
     "message": "一根悬臂梁，长3米，端部集中力10kN，请进行静力分析",
     "locale": "zh",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "beam",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 2,
-      "minElements": 1
+      "skills": { "primary": "beam", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "beam" },
+        { "type": "skill_match", "primary": "beam", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 2, "minElements": 1 },
+        { "type": "has_analysis" },
+        { "type": "natural_language", "description": "Analysis results should reflect a cantilever boundary condition (fixed at one end, free at the other)" }
+      ]
     }
   },
   {
     "id": "beam-simply-supported-en",
     "description": "Simply supported beam with UDL static analysis",
+    "category": "static-analysis",
+    "tags": ["beam", "static", "en", "basic"],
     "message": "A simply supported beam, span 8m, uniformly distributed load 15kN/m, please run a static analysis",
     "locale": "en",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "beam",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 2,
-      "minElements": 1
+      "skills": { "primary": "beam", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "beam" },
+        { "type": "skill_match", "primary": "beam", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 2, "minElements": 1 },
+        { "type": "has_analysis" },
+        { "type": "natural_language", "description": "Analysis results should include mid-span deflection or maximum displacement" }
+      ]
     }
+  },
+  {
+    "id": "beam-multi-turn-incomplete",
+    "description": "简支梁参数不全→多轮补全→分析",
+    "category": "static-analysis",
+    "tags": ["beam", "static", "zh", "multi-turn"],
+    "locale": "zh",
+    "maxRetries": 1,
+    "turns": [
+      {
+        "message": "帮我分析一根简支梁",
+        "assertions": [
+          { "type": "structural_type", "expected": "beam" },
+          { "type": "skill_match", "primary": "beam", "mayAlsoMatch": ["generic"] },
+          { "type": "has_interaction_questions" }
+        ]
+      },
+      {
+        "message": "跨度6米，均布荷载20kN/m",
+        "assertions": [
+          { "type": "has_model", "minNodes": 2, "minElements": 1 },
+          { "type": "has_analysis" },
+          { "type": "natural_language", "description": "Analysis results should include displacement values for a 6m beam under 20kN/m UDL" }
+        ]
+      }
+    ]
   }
 ]
diff --git a/tests/llm-benchmark/scenarios/double-span-beam.json b/tests/llm-benchmark/scenarios/double-span-beam.json
index 4b42df38..b7811570 100644
--- a/tests/llm-benchmark/scenarios/double-span-beam.json
+++ b/tests/llm-benchmark/scenarios/double-span-beam.json
@@ -2,14 +2,19 @@
   {
     "id": "double-span-beam-static",
     "description": "双跨连续梁静力分析",
+    "category": "static-analysis",
+    "tags": ["beam", "continuous", "static", "zh"],
     "message": "两跨连续梁，跨度5m和6m，均布荷载12kN/m，做静力分析",
     "locale": "zh",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "double-span-beam",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 3,
-      "minElements": 2
+      "skills": { "primary": "double-span-beam", "mayAlsoMatch": ["beam", "generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "double-span-beam" },
+        { "type": "skill_match", "primary": "double-span-beam", "mayAlsoMatch": ["beam", "generic"] },
+        { "type": "has_model", "minNodes": 3, "minElements": 2 },
+        { "type": "has_analysis" }
+      ]
     }
   }
 ]
diff --git a/tests/llm-benchmark/scenarios/frame.json b/tests/llm-benchmark/scenarios/frame.json
index 10755cbc..e53200dd 100644
--- a/tests/llm-benchmark/scenarios/frame.json
+++ b/tests/llm-benchmark/scenarios/frame.json
@@ -2,40 +2,57 @@
   {
     "id": "frame-2story-1bay-static",
     "description": "2层单跨钢框架静力分析",
+    "category": "static-analysis",
+    "tags": ["frame", "static", "zh", "multi-story"],
     "message": "2层单跨钢框架，层高3.6m，跨度6m，每层楼面荷载10kN/m2，请进行静力分析",
     "locale": "zh",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "frame",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 6,
-      "minElements": 6
+      "skills": { "primary": "frame", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "frame" },
+        { "type": "skill_match", "primary": "frame", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 6, "minElements": 6 },
+        { "type": "has_analysis" },
+        { "type": "natural_language", "description": "Analysis results should include story drift or inter-story displacement values" }
+      ]
     }
   },
   {
     "id": "frame-3story-2bay-static",
     "description": "3层2跨框架静力分析",
+    "category": "static-analysis",
+    "tags": ["frame", "static", "zh", "multi-bay"],
     "message": "3层2跨框架，层高3.3m，跨度5.4m和6m，每层楼面荷载15kN/m，请进行静力分析",
     "locale": "zh",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "frame",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 12,
-      "minElements": 12
+      "skills": { "primary": "frame", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "frame" },
+        { "type": "skill_match", "primary": "frame", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 12, "minElements": 12 },
+        { "type": "has_analysis" },
+        { "type": "natural_language", "description": "Model should represent a 3-story 2-bay frame with correct node and element count" }
+      ]
     }
   },
   {
     "id": "frame-steel-en",
     "description": "2-story single-bay steel frame static analysis",
+    "category": "static-analysis",
+    "tags": ["frame", "static", "en", "multi-story"],
     "message": "2-story single-bay steel frame, story height 3.6m, bay width 6m, floor load 10kN/m2, please run a static analysis",
     "locale": "en",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "frame",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 6,
-      "minElements": 6
+      "skills": { "primary": "frame", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "frame" },
+        { "type": "skill_match", "primary": "frame", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 6, "minElements": 6 },
+        { "type": "has_analysis" }
+      ]
     }
   }
 ]
diff --git a/tests/llm-benchmark/scenarios/portal-frame.json b/tests/llm-benchmark/scenarios/portal-frame.json
index cdce0322..44f6f3ec 100644
--- a/tests/llm-benchmark/scenarios/portal-frame.json
+++ b/tests/llm-benchmark/scenarios/portal-frame.json
@@ -2,27 +2,37 @@
   {
     "id": "portal-frame-static-18m",
     "description": "门式刚架18米跨度静力分析",
+    "category": "static-analysis",
+    "tags": ["portal-frame", "static", "zh"],
     "message": "门式刚架，跨度18m，高度7m，屋面荷载6kN/m，做静力分析",
     "locale": "zh",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "portal-frame",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 3,
-      "minElements": 3
+      "skills": { "primary": "portal-frame", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "portal-frame" },
+        { "type": "skill_match", "primary": "portal-frame", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 3, "minElements": 3 },
+        { "type": "has_analysis" }
+      ]
     }
   },
   {
     "id": "portal-frame-static-en",
     "description": "Portal frame 21m span static analysis",
+    "category": "static-analysis",
+    "tags": ["portal-frame", "static", "en"],
     "message": "Portal frame, span 21m, height 7.5m, roof load 8kN/m, please run a static analysis",
     "locale": "en",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "portal-frame",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 3,
-      "minElements": 3
+      "skills": { "primary": "portal-frame", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "portal-frame" },
+        { "type": "skill_match", "primary": "portal-frame", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 3, "minElements": 3 },
+        { "type": "has_analysis" }
+      ]
     }
   }
 ]
diff --git a/tests/llm-benchmark/scenarios/truss.json b/tests/llm-benchmark/scenarios/truss.json
index 59ce7b1d..ebf0f042 100644
--- a/tests/llm-benchmark/scenarios/truss.json
+++ b/tests/llm-benchmark/scenarios/truss.json
@@ -2,14 +2,19 @@
   {
     "id": "truss-triangle-static",
     "description": "三角桁架静力分析",
+    "category": "static-analysis",
+    "tags": ["truss", "static", "zh"],
     "message": "三角桁架，跨度12m，高3m，节点荷载20kN，做静力分析",
     "locale": "zh",
+    "maxRetries": 2,
     "expect": {
-      "structuralType": "truss",
-      "hasModel": true,
-      "hasAnalysis": true,
-      "minNodes": 3,
-      "minElements": 3
+      "skills": { "primary": "truss", "mayAlsoMatch": ["generic"] },
+      "assertions": [
+        { "type": "structural_type", "expected": "truss" },
+        { "type": "skill_match", "primary": "truss", "mayAlsoMatch": ["generic"] },
+        { "type": "has_model", "minNodes": 3, "minElements": 3 },
+        { "type": "has_analysis" }
+      ]
     }
   }
 ]
diff --git a/tests/llm-integration/lib/context.js b/tests/llm-integration/lib/context.js
deleted file mode 100644
index d80f4a4f..00000000
--- a/tests/llm-integration/lib/context.js
+++ /dev/null
@@ -1,38 +0,0 @@
-const path = require("node:path");
-const runtime = require("../../../scripts/cli/runtime");
-
-/**
- * Resolve the integration test context: paths, env vars, and pre-flight checks.
- * Throws early if LLM_API_KEY is missing.
- */
-function resolveIntegrationContext(rootDir) {
-  const projectRoot = runtime.resolveProjectRoot(rootDir);
-  const { paths, env } = runtime.loadProjectEnvironment(projectRoot);
-
-  const llmApiKey = process.env.LLM_API_KEY || env.LLM_API_KEY || "";
-  const llmModel = process.env.LLM_MODEL || env.LLM_MODEL || "";
-  const llmBaseUrl = process.env.LLM_BASE_URL || env.LLM_BASE_URL || "";
-
-  if (!llmApiKey) {
-    throw new Error(
-      "LLM_API_KEY is required for integration tests.\n" +
-      "Set it via environment variable or .env file."
-    );
-  }
-
-  return {
-    rootDir: projectRoot,
-    paths,
-    env: {
-      ...env,
-      LLM_API_KEY: llmApiKey,
-      LLM_MODEL: llmModel,
-      LLM_BASE_URL: llmBaseUrl,
-      DATABASE_URL: `file:${path
-        .join(projectRoot, ".structureclaw", "data", "structureclaw-llm-test.db")
-        .replace(/\\/gu, "/")}`,
-    },
-  };
-}
-
-module.exports = { resolveIntegrationContext };
diff --git a/tests/llm-integration/lib/executors.cjs b/tests/llm-integration/lib/executors.cjs
deleted file mode 100644
index 2c34f6d3..00000000
--- a/tests/llm-integration/lib/executors.cjs
+++ /dev/null
@@ -1,221 +0,0 @@
-const {
-  assert,
-  assertToolCalls,
-  applyCriticalMissingAssertions,
-} = require("./assertions.js");
-
-function resolveLocale(locale) {
-  return locale === "zh" ? "zh" : "en";
-}
-
-function resolveCaseExpect(testCase = {}) {
-  if (testCase.expect && typeof testCase.expect === "object") {
-    return testCase.expect;
-  }
-  if (testCase.assertions && typeof testCase.assertions === "object") {
-    return testCase.assertions;
-  }
-  return {};
-}
-
-function shouldEnableAutoCodeCheck(expected = {}) {
-  if (typeof expected.autoCodeCheck === "boolean") {
-    return expected.autoCodeCheck;
-  }
-
-  return Array.isArray(expected.toolCalls) && expected.toolCalls.includes("run_code_check");
-}
-
-function attachExecutionResult(error, key, result) {
-  if (error && typeof error === "object" && !Object.prototype.hasOwnProperty.call(error, key)) {
-    error[key] = result;
-  }
-  return error;
-}
-
-async function runRoutingTest(runtime, testCase) {
-  const locale = resolveLocale(testCase.locale);
-  const message = testCase.messages[0];
-  const match = await runtime.detectStructuralType(message, locale, undefined, testCase.enabledSkillIds);
-  const expected = resolveCaseExpect(testCase);
-
-  if (expected.inferredType) {
-    const actualKey = match.mappedType || match.key;
-    assert(
-      actualKey === expected.inferredType || match.skillId === expected.inferredType,
-      `expected inferredType="${expected.inferredType}", got key="${match.key}" mappedType="${match.mappedType}" skillId="${match.skillId}"`
-    );
-  }
-  if (expected.structuralTypeKey) {
-    assert(
-      match.key === expected.structuralTypeKey || match.mappedType === expected.structuralTypeKey,
-      `expected structuralTypeKey="${expected.structuralTypeKey}", got key="${match.key}" mappedType="${match.mappedType}"`
-    );
-  }
-}
-
-async function runExtractionTest(runtime, llm, testCase) {
-  const locale = resolveLocale(testCase.locale);
-  const message = testCase.messages[0];
-  const expected = resolveCaseExpect(testCase);
-
-  const result = await runtime.textToModelDraft(llm, message, undefined, locale, testCase.enabledSkillIds);
-
-  if (expected.inferredType) {
-    assert(
-      result.inferredType === expected.inferredType,
-      `expected inferredType="${expected.inferredType}", got "${result.inferredType}"`
-    );
-  }
-
-  if (
-    expected.criticalMissing !== undefined
-    || expected.criticalMissingIncludes
-    || expected.criticalMissingNotIncludes
-  ) {
-    applyCriticalMissingAssertions(result.missingFields || [], expected);
-  }
-
-  if (expected.draftPatch) {
-    try {
-      assertDraftPatch(result.stateToPersist || {}, expected.draftPatch);
-    } catch (error) {
-      throw attachExecutionResult(error, "draftResult", result);
-    }
-  }
-
-  return result;
-}
-
-function assertDraftPatch(state, expectedPatch) {
-  for (const [key, expectedValue] of Object.entries(expectedPatch)) {
-    const actualValue = state[key];
-    if (expectedValue === null || expectedValue === undefined) continue;
-
-    if (typeof expectedValue === "object" && expectedValue.value !== undefined) {
-      const tolerance = expectedValue.tolerance || 0.05;
-      const expected = expectedValue.value;
-
-      if (Array.isArray(expected)) {
-        assert(Array.isArray(actualValue), `expected ${key} to be an array, got ${typeof actualValue}: ${actualValue}`);
-        assert(actualValue.length === expected.length, `expected ${key} length ${expected.length}, got ${actualValue.length}`);
-        for (let i = 0; i < expected.length; i++) {
-          const diff = Math.abs(actualValue[i] - expected[i]) / Math.abs(expected[i] || 1);
-          assert(diff <= tolerance, `expected ${key}[${i}]=${expected[i]} (±${(tolerance * 100).toFixed(0)}%), got ${actualValue[i]}`);
-        }
-      } else {
-        assert(typeof actualValue === "number", `expected ${key} to be a number, got ${typeof actualValue}: ${actualValue}`);
-        const diff = Math.abs(actualValue - expected) / Math.abs(expected || 1);
-        assert(diff <= tolerance, `expected ${key}=${expected} (±${(tolerance * 100).toFixed(0)}%), got ${actualValue}`);
-      }
-    } else {
-      assert(
-        actualValue === expectedValue,
-        `expected ${key}="${expectedValue}", got "${actualValue}"`
-      );
-    }
-  }
-}
-
-async function runPipelineTest(agentService, testCase) {
-  const locale = resolveLocale(testCase.locale);
-  const message = testCase.messages[0];
-  const expected = resolveCaseExpect(testCase);
-
-  const result = await agentService.run({
-    message,
-    conversationId: `llm-test-${testCase.id}-${Date.now()}`,
-    traceId: `trace-${testCase.id}`,
-    context: {
-      locale,
-      skillIds: testCase.enabledSkillIds,
-      autoAnalyze: true,
-      includeReport: expected.expectReport !== false,
-      autoCodeCheck: shouldEnableAutoCodeCheck(expected),
-    },
-  });
-
-  try {
-    if (typeof expected.success === "boolean") {
-      assert(
-        Boolean(result.success) === expected.success,
-        `expected pipeline success=${expected.success}, got ${Boolean(result.success)}`
-      );
-    }
-
-    if (expected.toolCalls) {
-      assertToolCalls(result.toolCalls || [], expected.toolCalls);
-    }
-
-    const analysisCall = result.toolCalls?.find((tc) => tc.tool === "run_analysis");
-    if (expected.analysisSuccess === true) {
-      assert(
-        analysisCall,
-        "expected run_analysis to execute, but no run_analysis tool call was recorded"
-      );
-    }
-
-    if (expected.analysisSuccess !== false && result.toolCalls) {
-      if (analysisCall) {
-        assert(
-          analysisCall.status === "success",
-          `run_analysis should succeed, got status="${analysisCall.status}"${analysisCall.error ? `, error: ${analysisCall.error}` : ""}`
-        );
-      }
-    }
-  } catch (error) {
-    throw attachExecutionResult(error, "pipelineResult", result);
-  }
-
-  return result;
-}
-
-async function runClarificationTest(runtime, llm, testCase) {
-  const locale = resolveLocale(testCase.locale);
-  let currentState = undefined;
-  let lastResult = null;
-
-  for (let i = 0; i < testCase.turns.length; i++) {
-    const turn = testCase.turns[i];
-    const result = await runtime.textToModelDraft(llm, turn.message, currentState, locale, testCase.enabledSkillIds);
-    currentState = result.stateToPersist;
-    lastResult = result;
-
-    const expected = turn.assertions;
-
-    if (
-      expected.criticalMissing !== undefined
-      || expected.criticalMissingIncludes
-      || expected.criticalMissingNotIncludes
-    ) {
-      applyCriticalMissingAssertions(result.missingFields || [], expected);
-    }
-    if (expected.modelBuilt !== undefined) {
-      if (expected.modelBuilt) {
-        assert(result.model !== undefined, `expected model to be built on turn ${i + 1}, but it was undefined`);
-      } else {
-        assert(result.model === undefined, `expected model NOT to be built on turn ${i + 1}`);
-      }
-    }
-    if (expected.draftPatch) {
-      try {
-        assertDraftPatch(result.stateToPersist || {}, expected.draftPatch);
-      } catch (error) {
-        throw attachExecutionResult(error, "draftResult", result);
-      }
-    }
-  }
-
-  return lastResult;
-}
-
-module.exports = {
-  resolveCaseExpect,
-  shouldEnableAutoCodeCheck,
-  attachExecutionResult,
-  runRoutingTest,
-  runExtractionTest,
-  runPipelineTest,
-  runClarificationTest,
-  assertDraftPatch,
-};
diff --git a/tests/llm-integration/lib/executors.test.cjs b/tests/llm-integration/lib/executors.test.cjs
deleted file mode 100644
index 725c205d..00000000
--- a/tests/llm-integration/lib/executors.test.cjs
+++ /dev/null
@@ -1,233 +0,0 @@
-const test = require("node:test");
-const nodeAssert = require("node:assert/strict");
-
-const {
-  resolveCaseExpect,
-  runRoutingTest,
-  runExtractionTest,
-  runPipelineTest,
-} = require("./executors.cjs");
-
-test("resolveCaseExpect prefers v2 expect blocks", () => {
-  const expected = resolveCaseExpect({
-    expect: { inferredType: "frame" },
-    assertions: { inferredType: "beam" },
-  });
-
-  nodeAssert.deepEqual(expected, { inferredType: "frame" });
-});
-
-test("runRoutingTest forwards enabledSkillIds and uses normalized expect", async () => {
-  const calls = [];
-  const runtime = {
-    async detectStructuralType(message, locale, currentState, skillIds) {
-      calls.push({ message, locale, currentState, skillIds });
-      return { key: "frame", mappedType: "frame", skillId: "frame" };
-    },
-  };
-
-  await runRoutingTest(runtime, {
-    locale: "en",
-    messages: ["3-story steel frame"],
-    enabledSkillIds: ["frame"],
-    expect: {
-      inferredType: "frame",
-      structuralTypeKey: "frame",
-    },
-    assertions: {
-      inferredType: "beam",
-    },
-  });
-
-  nodeAssert.deepEqual(calls, [
-    {
-      message: "3-story steel frame",
-      locale: "en",
-      currentState: undefined,
-      skillIds: ["frame"],
-    },
-  ]);
-});
-
-test("runExtractionTest uses normalized expect blocks", async () => {
-  const runtime = {
-    async textToModelDraft(_llm, message, currentState, locale, skillIds) {
-      nodeAssert.equal(message, "3-story steel frame");
-      nodeAssert.equal(currentState, undefined);
-      nodeAssert.equal(locale, "en");
-      nodeAssert.deepEqual(skillIds, ["frame"]);
-      return {
-        inferredType: "frame",
-        missingFields: [],
-        stateToPersist: { storyCount: 3 },
-      };
-    },
-  };
-
-  const result = await runExtractionTest(runtime, {}, {
-    locale: "en",
-    messages: ["3-story steel frame"],
-    enabledSkillIds: ["frame"],
-    expect: {
-      inferredType: "frame",
-      criticalMissing: [],
-      draftPatch: { storyCount: 3 },
-    },
-    assertions: {
-      inferredType: "beam",
-    },
-  });
-
-  nodeAssert.equal(result.inferredType, "frame");
-});
-
-test("runPipelineTest derives context from normalized expect blocks", async () => {
-  const calls = [];
-  const agentService = {
-    async run(input) {
-      calls.push(input);
-      return {
-        toolCalls: [
-          { tool: "build_model", status: "success" },
-          { tool: "run_analysis", status: "success" },
-        ],
-      };
-    },
-  };
-
-  const result = await runPipelineTest(agentService, {
-    id: "frame-static-basic#specific",
-    locale: "en",
-    messages: ["3-story steel frame"],
-    enabledSkillIds: ["frame", "opensees-static"],
-    expect: {
-      toolCalls: ["build_model", "run_analysis"],
-      expectReport: false,
-    },
-    assertions: {
-      expectReport: true,
-    },
-  });
-
-  nodeAssert.equal(result.toolCalls.length, 2);
-  nodeAssert.equal(calls.length, 1);
-  nodeAssert.equal(calls[0].context.includeReport, false);
-  nodeAssert.deepEqual(calls[0].context.skillIds, ["frame", "opensees-static"]);
-  nodeAssert.equal(calls[0].context.autoCodeCheck, false);
-});
-
-test("runPipelineTest enables code check when the fixture expects run_code_check", async () => {
-  const calls = [];
-  const agentService = {
-    async run(input) {
-      calls.push(input);
-      return {
-        toolCalls: [
-          { tool: "build_model", status: "success" },
-          { tool: "run_analysis", status: "success" },
-          { tool: "run_code_check", status: "success" },
-        ],
-      };
-    },
-  };
-
-  await runPipelineTest(agentService, {
-    id: "frame-static-basic#specific",
-    locale: "en",
-    messages: ["2-story single-bay steel frame"],
-    enabledSkillIds: ["frame", "opensees-static"],
-    expect: {
-      toolCalls: ["build_model", "run_analysis", "run_code_check"],
-    },
-  });
-
-  nodeAssert.equal(calls.length, 1);
-  nodeAssert.equal(calls[0].context.autoCodeCheck, true);
-});
-
-test("runPipelineTest attaches pipeline results to assertion failures", async () => {
-  const agentService = {
-    async run() {
-      return {
-        toolCalls: [
-          { tool: "build_model", status: "success" },
-          { tool: "run_analysis", status: "success" },
-        ],
-      };
-    },
-  };
-
-  let error;
-  try {
-    await runPipelineTest(agentService, {
-      id: "frame-static-basic#specific",
-      locale: "en",
-      messages: ["2-story single-bay steel frame"],
-      enabledSkillIds: ["frame", "opensees-static", "code-check-gb50017"],
-      expect: {
-        toolCalls: ["build_model", "run_analysis", "run_code_check"],
-      },
-    });
-  } catch (err) {
-    error = err;
-  }
-
-  nodeAssert.ok(error.pipelineResult);
-  nodeAssert.deepEqual(
-    error.pipelineResult.toolCalls.map((call) => call.tool),
-    ["build_model", "run_analysis"],
-  );
-});
-
-test("runPipelineTest asserts explicit pipeline success flags", async () => {
-  const agentService = {
-    async run() {
-      return {
-        success: false,
-        toolCalls: [
-          { tool: "build_model", status: "success" },
-          { tool: "run_analysis", status: "success" },
-        ],
-      };
-    },
-  };
-
-  await nodeAssert.rejects(
-    () => runPipelineTest(agentService, {
-      id: "truss-static-basic#specific",
-      locale: "zh",
-      messages: ["三角桁架，跨度12m，高3m，节点荷载20kN，做静力分析"],
-      enabledSkillIds: ["truss", "opensees-static"],
-      expect: {
-        success: true,
-        toolCalls: ["build_model", "run_analysis"],
-      },
-    }),
-    /expected pipeline success=true, got false/,
-  );
-});
-
-test("runPipelineTest requires run_analysis when analysisSuccess is true", async () => {
-  const agentService = {
-    async run() {
-      return {
-        success: true,
-        toolCalls: [
-          { tool: "build_model", status: "success" },
-        ],
-      };
-    },
-  };
-
-  await nodeAssert.rejects(
-    () => runPipelineTest(agentService, {
-      id: "frame-pipeline-multi-bay-zh#legacy",
-      locale: "zh",
-      messages: ["3层2跨框架，层高3.3m，跨度5.4m和6m，每层楼面荷载15kN/m"],
-      expect: {
-        analysisSuccess: true,
-      },
-    }),
-    /expected run_analysis to execute, but no run_analysis tool call was recorded/,
-  );
-});
diff --git a/tests/llm-integration/lib/real-llm-client.cjs b/tests/llm-integration/lib/real-llm-client.cjs
deleted file mode 100644
index f8fa76c0..00000000
--- a/tests/llm-integration/lib/real-llm-client.cjs
+++ /dev/null
@@ -1,126 +0,0 @@
-const { createRequire } = require("node:module");
-const fs = require("node:fs");
-const path = require("node:path");
-
-/**
- * Create a real LLM client using the backend's @langchain/openai dependency.
- * Uses `apiKey` (v1.x) parameter name. Reads config from process.env which
- * must be set before calling this function.
- *
- * The returned client is wrapped with LLM call logging so that every
- * invoke() call is recorded to .structureclaw/logs/llm-calls-test.jsonl.
- *
- * @param {object} context - Integration context with env vars
- * @param {number} [temperature=0] - LLM temperature (0 for deterministic)
- * @returns {import('@langchain/openai').ChatOpenAI | null}
- */
-function createRealLlmClient(context, temperature = 0) {
-  const apiKey = context.env.LLM_API_KEY || process.env.LLM_API_KEY || "";
-  if (!apiKey) {
-    return null;
-  }
-
-  const backendRequire = createRequire(
-    path.join(context.rootDir, "backend", "package.json")
-  );
-  const { ChatOpenAI } = backendRequire("@langchain/openai");
-
-  const model = new ChatOpenAI({
-    model: context.env.LLM_MODEL || process.env.LLM_MODEL || undefined,
-    temperature,
-    timeout: parseInt(context.env.LLM_TIMEOUT_MS || process.env.LLM_TIMEOUT_MS || "90000", 10),
-    maxRetries: parseInt(context.env.LLM_MAX_RETRIES || process.env.LLM_MAX_RETRIES || "1", 10),
-    apiKey,
-    configuration: {
-      baseURL: context.env.LLM_BASE_URL || process.env.LLM_BASE_URL || undefined,
-    },
-  });
-
-  return wrapWithLogging(model, context);
-}
-
-/**
- * Self-contained LLM call logger. Writes one JSON line per invoke() call
- * to <rootDir>/.structureclaw/logs/llm-calls-test.jsonl — same format as the backend's
- * LlmCallLogger so the CI artifact upload picks it up automatically.
- */
-let _logStream = null;
-let _logDisabled = false;
-function ensureLogStream(rootDir) {
-  if (_logDisabled) return null;
-  if (_logStream) return _logStream;
-  if (process.env.LLM_LOG_ENABLED === "false") { _logDisabled = true; return null; }
-  try {
-    const dir = process.env.LLM_LOG_DIR || path.join(rootDir, ".structureclaw", "logs");
-    fs.mkdirSync(dir, { recursive: true });
-    _logStream = fs.createWriteStream(path.join(dir, "llm-calls-test.jsonl"), { flags: "a" });
-    _logStream.on("error", () => { _logDisabled = true; _logStream = null; });
-    return _logStream;
-  } catch {
-    _logDisabled = true;
-    return null;
-  }
-}
-
-function wrapWithLogging(model, context) {
-  const stream = ensureLogStream(context.rootDir);
-  if (!stream) return model;
-
-  const modelName = context.env.LLM_MODEL || process.env.LLM_MODEL || "unknown";
-  const originalInvoke = model.invoke.bind(model);
-
-  function safeStringify(val) {
-    if (typeof val === "string") return val;
-    try {
-      const result = JSON.stringify(val);
-      return result === undefined ? String(val) : result;
-    } catch {
-      return String(val);
-    }
-  }
-
-  function writeLogEntry(entry) {
-    try {
-      stream.write(JSON.stringify(entry) + "\n");
-    } catch {
-      // Non-blocking: never crash on log write failure.
-    }
-  }
-
-  model.invoke = async function (input, options) {
-    const promptStr = safeStringify(input);
-    const start = Date.now();
-    try {
-      const result = await originalInvoke(input, options);
-      const content = safeStringify(result.content);
-      writeLogEntry({
-        timestamp: new Date().toISOString(),
-        model: modelName,
-        prompt: promptStr,
-        response: content,
-        promptChars: promptStr.length,
-        responseChars: content.length,
-        durationMs: Date.now() - start,
-        success: true,
-      });
-      return result;
-    } catch (error) {
-      writeLogEntry({
-        timestamp: new Date().toISOString(),
-        model: modelName,
-        prompt: promptStr,
-        response: null,
-        promptChars: promptStr.length,
-        responseChars: 0,
-        durationMs: Date.now() - start,
-        success: false,
-        error: String(error),
-      });
-      throw error;
-    }
-  };
-
-  return model;
-}
-
-module.exports = { createRealLlmClient };
diff --git a/tests/llm-integration/lib/reporting.cjs b/tests/llm-integration/lib/reporting.cjs
deleted file mode 100644
index 492fa1cf..00000000
--- a/tests/llm-integration/lib/reporting.cjs
+++ /dev/null
@@ -1,47 +0,0 @@
-const fs = require("node:fs");
-const path = require("node:path");
-
-/**
- * Format a single test case summary for console output.
- */
-function formatCaseSummary(testCase, observedTrace, status) {
-  const tools = (observedTrace.toolCalls || [])
-    .filter((call) => call.status === "success")
-    .map((call) => call.tool)
-    .join(" -> ") || "(none)";
-
-  return [
-    `  ${status} ${testCase.id} [${testCase.category}/${testCase.variant}]`,
-    `    enabled: ${(observedTrace.enabledSkillIds || []).join(", ") || "(auto)"}`,
-    `    activated: ${(observedTrace.activatedSkillIds || []).join(", ") || "(none)"}`,
-    `    structural: ${observedTrace.structuralSkillId || "(none)"}`,
-    `    analysis: ${observedTrace.analysisSkillId || "(none)"}`,
-    `    tools: ${tools}`
-  ].join("\n");
-}
-
-/**
- * Append a record to a JSON artifact file.
- * Creates the file with an array if it doesn't exist.
- */
-function appendArtifactRecord(outputPath, record) {
-  const dir = path.dirname(outputPath);
-  if (!fs.existsSync(dir)) {
-    fs.mkdirSync(dir, { recursive: true });
-  }
-
-  let records = [];
-  if (fs.existsSync(outputPath)) {
-    try {
-      const parsed = JSON.parse(fs.readFileSync(outputPath, "utf-8"));
-      records = Array.isArray(parsed) ? parsed : [];
-    } catch (_) {
-      records = [];
-    }
-  }
-
-  records.push(record);
-  fs.writeFileSync(outputPath, JSON.stringify(records, null, 2) + "\n");
-}
-
-module.exports = { formatCaseSummary, appendArtifactRecord };
diff --git a/tests/llm-integration/lib/reporting.test.cjs b/tests/llm-integration/lib/reporting.test.cjs
deleted file mode 100644
index 875f0e4e..00000000
--- a/tests/llm-integration/lib/reporting.test.cjs
+++ /dev/null
@@ -1,65 +0,0 @@
-const test = require("node:test");
-const nodeAssert = require("node:assert/strict");
-const fs = require("node:fs");
-const path = require("node:path");
-const os = require("node:os");
-
-const { formatCaseSummary, appendArtifactRecord } = require("./reporting.cjs");
-
-test("formatCaseSummary produces expected multi-line output", () => {
-  const summary = formatCaseSummary(
-    { id: "frame-static-basic#specific", category: "pipeline", variant: "specific" },
-    {
-      enabledSkillIds: ["frame", "opensees-static"],
-      activatedSkillIds: ["frame"],
-      structuralSkillId: "frame",
-      analysisSkillId: "opensees-static",
-      toolCalls: [{ tool: "build_model", status: "success" }, { tool: "run_analysis", status: "success" }]
-    },
-    "PASS"
-  );
-
-  nodeAssert.ok(summary.includes("PASS frame-static-basic#specific"));
-  nodeAssert.ok(summary.includes("[pipeline/specific]"));
-  nodeAssert.ok(summary.includes("enabled: frame, opensees-static"));
-  nodeAssert.ok(summary.includes("structural: frame"));
-  nodeAssert.ok(summary.includes("tools: build_model -> run_analysis"));
-});
-
-test("formatCaseSummary handles empty tool calls", () => {
-  const summary = formatCaseSummary(
-    { id: "test", category: "extraction", variant: "legacy" },
-    { enabledSkillIds: undefined, activatedSkillIds: [], toolCalls: [] },
-    "PASS"
-  );
-
-  nodeAssert.ok(summary.includes("enabled: (auto)"));
-  nodeAssert.ok(summary.includes("tools: (none)"));
-});
-
-test("appendArtifactRecord writes and appends records", () => {
-  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "reporting-test-"));
-  const filePath = path.join(tmpDir, "output.json");
-
-  appendArtifactRecord(filePath, { id: "case-1", status: "PASS" });
-  appendArtifactRecord(filePath, { id: "case-2", status: "FAIL" });
-
-  const records = JSON.parse(fs.readFileSync(filePath, "utf-8"));
-  nodeAssert.equal(records.length, 2);
-  nodeAssert.equal(records[0].id, "case-1");
-  nodeAssert.equal(records[1].status, "FAIL");
-
-  fs.rmSync(tmpDir, { recursive: true });
-});
-
-test("appendArtifactRecord creates parent directories", () => {
-  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "reporting-test-"));
-  const filePath = path.join(tmpDir, "nested", "dir", "output.json");
-
-  appendArtifactRecord(filePath, { id: "case-1", status: "PASS" });
-
-  const records = JSON.parse(fs.readFileSync(filePath, "utf-8"));
-  nodeAssert.equal(records.length, 1);
-
-  fs.rmSync(tmpDir, { recursive: true });
-});
diff --git a/tests/llm-integration/lib/retry.js b/tests/llm-integration/lib/retry.js
deleted file mode 100644
index 3464354e..00000000
--- a/tests/llm-integration/lib/retry.js
+++ /dev/null
@@ -1,47 +0,0 @@
-const MAX_ATTEMPTS = 3; // 1 initial + 2 retries
-
-function shouldRetryError(err) {
-  const message = err instanceof Error ? err.message : String(err || "");
-
-  if (!message) {
-    return false;
-  }
-
-  return (
-    /\b(408|409|425|429)\b/u.test(message)
-    || /\b5\d{2}\b/u.test(message)
-    || /rate limit/iu.test(message)
-    || /quota exceeded/iu.test(message)
-    || /temporarily unavailable/iu.test(message)
-    || /overloaded/iu.test(message)
-    || /timeout/iu.test(message)
-    || /timed out/iu.test(message)
-    || /ECONNRESET|ECONNREFUSED|ETIMEDOUT|EAI_AGAIN/u.test(message)
-    || /socket hang up/iu.test(message)
-  );
-}
-
-/**
- * Retry an async function up to MAX_ATTEMPTS times.
- * By default only transient upstream failures retry; runner-level callers can
- * opt into retrying every case failure to absorb LLM output drift.
- */
-async function withRetry(fn, label = "test", maxAttempts = MAX_ATTEMPTS, options = {}) {
-  const retryOnAnyError = options.retryOnAnyError === true;
-
-  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
-    try {
-      return await fn();
-    } catch (err) {
-      if (attempt === maxAttempts || (!retryOnAnyError && !shouldRetryError(err))) {
-        throw err;
-      }
-      const msg = err instanceof Error ? err.message : String(err);
-      process.stdout.write(
-        `  [RETRY] ${label} (attempt ${attempt}/${maxAttempts}) — ${msg}\n`
-      );
-    }
-  }
-}
-
-module.exports = { withRetry, MAX_ATTEMPTS, shouldRetryError };
diff --git a/tests/llm-integration/lib/retry.test.cjs b/tests/llm-integration/lib/retry.test.cjs
deleted file mode 100644
index 03dc4d78..00000000
--- a/tests/llm-integration/lib/retry.test.cjs
+++ /dev/null
@@ -1,47 +0,0 @@
-const test = require("node:test");
-const nodeAssert = require("node:assert/strict");
-
-const { withRetry } = require("./retry.js");
-
-test("withRetry does not retry deterministic assertion failures", async () => {
-  let attempts = 0;
-
-  await nodeAssert.rejects(async () => {
-    await withRetry(async () => {
-      attempts += 1;
-      throw new Error('expected tool "run_code_check" in successful tool calls');
-    }, "pipeline-case", 8);
-  }, /run_code_check/);
-
-  nodeAssert.equal(attempts, 1);
-});
-
-test("withRetry retries transient upstream failures", async () => {
-  let attempts = 0;
-
-  const result = await withRetry(async () => {
-    attempts += 1;
-    if (attempts < 3) {
-      throw new Error("429 rate limited");
-    }
-    return "ok";
-  }, "upstream-case", 8);
-
-  nodeAssert.equal(result, "ok");
-  nodeAssert.equal(attempts, 3);
-});
-
-test("withRetry can retry deterministic failures when case-level retries are enabled", async () => {
-  let attempts = 0;
-
-  const result = await withRetry(async () => {
-    attempts += 1;
-    if (attempts < 3) {
-      throw new Error('did not expect "frameDimension" in criticalMissing, but it was present');
-    }
-    return "ok";
-  }, "llm-case", 3, { retryOnAnyError: true });
-
-  nodeAssert.equal(result, "ok");
-  nodeAssert.equal(attempts, 3);
-});
diff --git a/tests/llm-integration/lib/selection.cjs b/tests/llm-integration/lib/selection.cjs
deleted file mode 100644
index 33350832..00000000
--- a/tests/llm-integration/lib/selection.cjs
+++ /dev/null
@@ -1,85 +0,0 @@
-function parseLlmIntegrationOptions(args) {
-  let category;
-  let family;
-  let skillId;
-  let variant;
-  let scenarioId;
-  let outputPath;
-
-  for (let index = 0; index < args.length; index += 1) {
-    const current = args[index];
-
-    // Handle --key=value form
-    const eqIndex = current.indexOf("=");
-    if (current.startsWith("--") && eqIndex > 2) {
-      const key = current.slice(0, eqIndex);
-      const value = current.slice(eqIndex + 1);
-
-      if (key === "--family" || key === "--skill") {
-        family = value;
-        skillId = value;
-      } else if (key === "--variant") {
-        if (value !== "auto") variant = value;
-      } else if (key === "--scenario") {
-        scenarioId = value;
-      } else if (key === "--output") {
-        outputPath = value;
-      }
-      continue;
-    }
-
-    if (current === "--family" || current === "--skill") {
-      family = args[index + 1];
-      skillId = args[index + 1];
-      index += 1;
-      continue;
-    }
-    if (current === "--variant") {
-      const raw = args[index + 1];
-      if (raw !== "auto") variant = raw;
-      index += 1;
-      continue;
-    }
-    if (current === "--scenario") {
-      scenarioId = args[index + 1];
-      index += 1;
-      continue;
-    }
-    if (current === "--output") {
-      outputPath = args[index + 1];
-      index += 1;
-      continue;
-    }
-    if (!current.startsWith("--") && category === undefined) {
-      category = current;
-    }
-  }
-
-  return { category, family, skillId, variant, scenarioId, outputPath };
-}
-
-function filterLlmTestCases(testCases, options = {}) {
-  return testCases.filter((testCase) => {
-    if (options.category && testCase.category !== options.category) {
-      return false;
-    }
-    if (options.skillId && testCase.skillId !== options.skillId) {
-      return false;
-    }
-    if (options.family && testCase.family !== options.family) {
-      return false;
-    }
-    if (options.variant && testCase.variant !== options.variant) {
-      return false;
-    }
-    if (options.scenarioId && testCase.scenarioId !== options.scenarioId) {
-      return false;
-    }
-    return true;
-  });
-}
-
-module.exports = {
-  parseLlmIntegrationOptions,
-  filterLlmTestCases,
-};
diff --git a/tests/llm-integration/lib/selection.test.cjs b/tests/llm-integration/lib/selection.test.cjs
deleted file mode 100644
index 24542dd6..00000000
--- a/tests/llm-integration/lib/selection.test.cjs
+++ /dev/null
@@ -1,101 +0,0 @@
-const test = require("node:test");
-const nodeAssert = require("node:assert/strict");
-
-const { parseLlmIntegrationOptions, filterLlmTestCases } = require("./selection.cjs");
-
-test("parseLlmIntegrationOptions reads category and skill filters", () => {
-  const options = parseLlmIntegrationOptions(["extraction", "--skill", "frame"]);
-
-  nodeAssert.deepEqual(options, {
-    category: "extraction",
-    family: "frame",
-    skillId: "frame",
-    variant: undefined,
-    scenarioId: undefined,
-    outputPath: undefined,
-  });
-});
-
-test("parseLlmIntegrationOptions defaults filters to undefined", () => {
-  const options = parseLlmIntegrationOptions([]);
-
-  nodeAssert.deepEqual(options, {
-    category: undefined,
-    family: undefined,
-    skillId: undefined,
-    variant: undefined,
-    scenarioId: undefined,
-    outputPath: undefined,
-  });
-});
-
-test("filterLlmTestCases narrows by category and skillId", () => {
-  const cases = [
-    { id: "frame-extraction", category: "extraction", skillId: "frame" },
-    { id: "frame-pipeline", category: "pipeline", skillId: "frame" },
-    { id: "beam-extraction", category: "extraction", skillId: "beam" },
-  ];
-
-  const filtered = filterLlmTestCases(cases, {
-    category: "extraction",
-    skillId: "frame",
-  });
-
-  nodeAssert.deepEqual(filtered.map((item) => item.id), ["frame-extraction"]);
-});
-
-test("parseLlmIntegrationOptions reads family, variant, scenario and output filters", () => {
-  const options = parseLlmIntegrationOptions([
-    "pipeline",
-    "--family", "frame",
-    "--variant", "specific",
-    "--scenario", "frame-static-basic",
-    "--output", "tests/.artifacts/frame.json"
-  ]);
-
-  nodeAssert.deepEqual(options, {
-    category: "pipeline",
-    family: "frame",
-    skillId: "frame",
-    variant: "specific",
-    scenarioId: "frame-static-basic",
-    outputPath: "tests/.artifacts/frame.json"
-  });
-});
-
-test("filterLlmTestCases narrows by family, variant and scenarioId", () => {
-  const cases = [
-    { id: "frame-static-basic#specific", family: "frame", variant: "specific", scenarioId: "frame-static-basic", category: "pipeline" },
-    { id: "frame-static-basic#generic", family: "frame", variant: "generic", scenarioId: "frame-static-basic", category: "pipeline" },
-    { id: "beam-basic#specific", family: "beam", variant: "specific", scenarioId: "beam-basic", category: "pipeline" }
-  ];
-
-  const filtered = filterLlmTestCases(cases, {
-    category: "pipeline",
-    family: "frame",
-    variant: "generic",
-    scenarioId: "frame-static-basic"
-  });
-
-  nodeAssert.deepEqual(filtered.map((item) => item.id), ["frame-static-basic#generic"]);
-});
-
-test("parseLlmIntegrationOptions handles --key=value form", () => {
-  const options = parseLlmIntegrationOptions(["--family=beam", "--variant=specific"]);
-
-  nodeAssert.equal(options.family, "beam");
-  nodeAssert.equal(options.skillId, "beam");
-  nodeAssert.equal(options.variant, "specific");
-});
-
-test("parseLlmIntegrationOptions treats --variant auto as no filter", () => {
-  const options = parseLlmIntegrationOptions(["--variant", "auto"]);
-
-  nodeAssert.equal(options.variant, undefined);
-});
-
-test("parseLlmIntegrationOptions treats --variant=auto as no filter", () => {
-  const options = parseLlmIntegrationOptions(["--variant=auto"]);
-
-  nodeAssert.equal(options.variant, undefined);
-});
diff --git a/tests/llm-integration/lib/server.js b/tests/llm-integration/lib/server.js
deleted file mode 100644
index d3e7c7c5..00000000
--- a/tests/llm-integration/lib/server.js
+++ /dev/null
@@ -1,48 +0,0 @@
-const { createRequire } = require("node:module");
-const path = require("node:path");
-const { pathToFileURL } = require("node:url");
-
-/**
- * Create a lightweight Fastify server with real backend routes for integration tests.
- * Uses a temporary SQLite database and real LLM configuration.
- *
- * @param {object} context - Integration context with env vars
- * @returns {Promise<{ app: import('fastify').FastifyInstance, close: () => Promise<void> }>}
- */
-async function createTestServer(context) {
-  const backendRequire = createRequire(
-    path.join(context.rootDir, "backend", "package.json")
-  );
-  const Fastify = backendRequire("fastify");
-
-  const app = Fastify({ bodyLimit: 20 * 1024 * 1024 });
-
-  // Apply env vars so the backend config module picks them up
-  for (const [key, value] of Object.entries(context.env)) {
-    if (value !== undefined && value !== "") {
-      process.env[key] = value;
-    }
-  }
-
-  // Register real backend routes
-  const { agentRoutes } = await import(
-    pathToFileURL(path.join(context.rootDir, "backend", "dist", "api", "agent.js")).href
-  );
-  const { chatRoutes } = await import(
-    pathToFileURL(path.join(context.rootDir, "backend", "dist", "api", "chat.js")).href
-  );
-
-  await app.register(agentRoutes, { prefix: "/api/v1/agent" });
-  await app.register(chatRoutes, { prefix: "/api/v1/chat" });
-
-  await app.ready();
-
-  return {
-    app,
-    async close() {
-      await app.close();
-    },
-  };
-}
-
-module.exports = { createTestServer };
diff --git a/tests/llm-integration/lib/summarize.test.cjs b/tests/llm-integration/lib/summarize.test.cjs
deleted file mode 100644
index 0ff8722e..00000000
--- a/tests/llm-integration/lib/summarize.test.cjs
+++ /dev/null
@@ -1,29 +0,0 @@
-const test = require("node:test");
-const nodeAssert = require("node:assert/strict");
-
-const { summarizeArtifacts } = require("../summarize.cjs");
-
-test("summarizeArtifacts groups pass rate by family and variant", () => {
-  const summary = summarizeArtifacts([
-    { family: "frame", variant: "specific", status: "PASS" },
-    { family: "frame", variant: "specific", status: "FAIL" },
-    { family: "frame", variant: "generic", status: "PASS" }
-  ]);
-
-  nodeAssert.deepEqual(summary.frame.specific, { passed: 1, failed: 1, total: 2 });
-  nodeAssert.deepEqual(summary.frame.generic, { passed: 1, failed: 0, total: 1 });
-});
-
-test("summarizeArtifacts handles missing family/variant", () => {
-  const summary = summarizeArtifacts([
-    { status: "PASS" }
-  ]);
-
-  nodeAssert.ok(summary.unknown);
-  nodeAssert.deepEqual(summary.unknown.unknown, { passed: 1, failed: 0, total: 1 });
-});
-
-test("summarizeArtifacts handles empty records", () => {
-  const summary = summarizeArtifacts([]);
-  nodeAssert.deepEqual(summary, {});
-});
diff --git a/tests/llm-integration/lib/trace.cjs b/tests/llm-integration/lib/trace.cjs
deleted file mode 100644
index 2e463e39..00000000
--- a/tests/llm-integration/lib/trace.cjs
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Resolve the observed trace from test execution results.
- * Normalizes draft and pipeline observations into a unified structure.
- */
-function resolveObservedTrace({ testCase, draftResult, pipelineResult }) {
-  if (pipelineResult) {
-    return {
-      enabledSkillIds: testCase.enabledSkillIds,
-      selectedSkillIds: pipelineResult.routing?.selectedSkillIds || testCase.enabledSkillIds || [],
-      activatedSkillIds: pipelineResult.routing?.activatedSkillIds || [],
-      structuralSkillId: pipelineResult.routing?.structuralSkillId,
-      analysisSkillId: pipelineResult.routing?.analysisSkillId,
-      toolCalls: pipelineResult.toolCalls || []
-    };
-  }
-  return {
-    enabledSkillIds: testCase.enabledSkillIds,
-    selectedSkillIds: testCase.enabledSkillIds || [],
-    activatedSkillIds: [],
-    structuralSkillId: draftResult?.structuralTypeMatch?.skillId || draftResult?.stateToPersist?.skillId,
-    analysisSkillId: undefined,
-    toolCalls: []
-  };
-}
-
-module.exports = { resolveObservedTrace };
diff --git a/tests/llm-integration/lib/trace.test.cjs b/tests/llm-integration/lib/trace.test.cjs
deleted file mode 100644
index de497a34..00000000
--- a/tests/llm-integration/lib/trace.test.cjs
+++ /dev/null
@@ -1,50 +0,0 @@
-const test = require("node:test");
-const nodeAssert = require("node:assert/strict");
-
-const { resolveObservedTrace } = require("./trace.cjs");
-
-test("resolveObservedTrace normalizes draft observations", () => {
-  const trace = resolveObservedTrace({
-    testCase: { enabledSkillIds: ["frame", "opensees-static"], fallbackPolicy: "forbid-generic" },
-    draftResult: { structuralTypeMatch: { skillId: "frame" }, inferredType: "frame", extractionMode: "llm" }
-  });
-
-  nodeAssert.equal(trace.structuralSkillId, "frame");
-  nodeAssert.deepEqual(trace.enabledSkillIds, ["frame", "opensees-static"]);
-  nodeAssert.deepEqual(trace.selectedSkillIds, ["frame", "opensees-static"]);
-  nodeAssert.equal(trace.analysisSkillId, undefined);
-  nodeAssert.deepEqual(trace.toolCalls, []);
-});
-
-test("resolveObservedTrace normalizes pipeline observations", () => {
-  const trace = resolveObservedTrace({
-    testCase: { enabledSkillIds: ["frame", "opensees-static"] },
-    pipelineResult: {
-      routing: {
-        selectedSkillIds: ["frame", "opensees-static"],
-        activatedSkillIds: ["frame", "opensees-static", "validation-structure-model"],
-        structuralSkillId: "frame",
-        analysisSkillId: "opensees-static"
-      },
-      toolCalls: [
-        { tool: "build_model", status: "success", authorizedBySkillIds: ["frame"] },
-        { tool: "run_analysis", status: "success", authorizedBySkillIds: ["opensees-static"] }
-      ]
-    }
-  });
-
-  nodeAssert.equal(trace.structuralSkillId, "frame");
-  nodeAssert.equal(trace.analysisSkillId, "opensees-static");
-  nodeAssert.deepEqual(trace.activatedSkillIds, ["frame", "opensees-static", "validation-structure-model"]);
-  nodeAssert.equal(trace.toolCalls.length, 2);
-});
-
-test("resolveObservedTrace handles missing draft result", () => {
-  const trace = resolveObservedTrace({
-    testCase: { enabledSkillIds: undefined }
-  });
-
-  nodeAssert.equal(trace.structuralSkillId, undefined);
-  nodeAssert.deepEqual(trace.enabledSkillIds, undefined);
-  nodeAssert.deepEqual(trace.selectedSkillIds, []);
-});
diff --git a/tests/llm-integration/runner.cjs b/tests/llm-integration/runner.cjs
deleted file mode 100644
index 232ae18d..00000000
--- a/tests/llm-integration/runner.cjs
+++ /dev/null
@@ -1,224 +0,0 @@
-const path = require("node:path");
-const { pathToFileURL } = require("node:url");
-
-const { resolveIntegrationContext } = require("./lib/context.js");
-const { createRealLlmClient } = require("./lib/real-llm-client.cjs");
-const { withRetry, MAX_ATTEMPTS } = require("./lib/retry.js");
-const { loadLlmFixtures } = require("./lib/discovery.cjs");
-const { parseLlmIntegrationOptions, filterLlmTestCases } = require("./lib/selection.cjs");
-const {
-  assertRoutingTrace,
-  assertToolAuthorizers,
-} = require("./lib/assertions.js");
-const {
-  runRoutingTest,
-  runExtractionTest,
-  runPipelineTest,
-  runClarificationTest,
-} = require("./lib/executors.cjs");
-const { resolveObservedTrace } = require("./lib/trace.cjs");
-const { formatCaseSummary, appendArtifactRecord } = require("./lib/reporting.cjs");
-
-/** Import AgentSkillRuntime from backend dist. */
-async function importAgentSkillRuntime(rootDir) {
-  const filePath = path.join(rootDir, "backend", "dist", "agent-runtime", "index.js");
-  const mod = await import(pathToFileURL(filePath).href);
-  return mod.AgentSkillRuntime;
-}
-
-/** Import and instantiate LangGraphAgentService with real LLM. */
-async function createAgentService(rootDir, skillRuntime) {
-  const filePath = path.join(rootDir, "backend", "dist", "agent-langgraph", "agent-service.js");
-  const mod = await import(`${pathToFileURL(filePath).href}?llm-test=${Date.now()}`);
-  const LangGraphAgentService = mod.LangGraphAgentService;
-  return new LangGraphAgentService(skillRuntime);
-}
-
-// ---------------------------------------------------------------------------
-// Main runner
-// ---------------------------------------------------------------------------
-
-async function runLlmIntegrationTests(rootDir, args) {
-  const maxAttempts = MAX_ATTEMPTS;
-  const context = resolveIntegrationContext(rootDir);
-  const options = parseLlmIntegrationOptions(args);
-
-  // Inject LLM env vars into process.env BEFORE importing backend modules.
-  // The backend config module reads process.env at import time.
-  for (const [k, v] of Object.entries(context.env)) {
-    if (v !== undefined && v !== "") {
-      process.env[k] = v;
-    }
-  }
-
-  // Ensure backend is built
-  const { runBackendBuildOnce } = require("../regression/shared.js");
-  await runBackendBuildOnce(context);
-
-  // Ensure DB is ready
-  const { execSync } = require("node:child_process");
-  execSync("npx prisma db push --accept-data-loss", {
-    cwd: path.join(rootDir, "backend"),
-    env: { ...process.env, ...context.env },
-    stdio: "pipe",
-  });
-
-  // Load test cases — default to routing-only since extraction/pipeline/clarification
-  // categories depend on the legacy AgentService API (textToModelDraft etc.)
-  // which no longer exists under the LangGraph ReAct architecture.
-  const effectiveCategory = options.category || "routing";
-  const allCases = loadLlmFixtures(rootDir);
-  const cases = filterLlmTestCases(allCases, { ...options, category: effectiveCategory });
-
-  if (cases.length === 0) {
-    process.stdout.write("No test cases matched.\n");
-    return;
-  }
-
-  process.stdout.write(`\n${"=".repeat(60)}\n`);
-  process.stdout.write(`LLM Integration Tests: ${cases.length} cases\n`);
-  process.stdout.write(`Model: ${context.env.LLM_MODEL || "(default)"}\n`);
-  process.stdout.write(`Base URL: ${context.env.LLM_BASE_URL || "(default)"}\n`);
-  process.stdout.write(`Category: ${options.category || "(all)"}\n`);
-  process.stdout.write(`Skill: ${options.skillId || "(all)"}\n`);
-  process.stdout.write(`Family: ${options.family || "(all)"}\n`);
-  process.stdout.write(`Variant: ${options.variant || "(all)"}\n`);
-  process.stdout.write(`${"=".repeat(60)}\n\n`);
-
-  // Create LLM client and runtime
-  const llm = createRealLlmClient(context, 0);
-  const AgentSkillRuntime = await importAgentSkillRuntime(rootDir);
-  const runtime = new AgentSkillRuntime();
-
-  let agentService = null;
-
-  const results = { passed: 0, failed: 0, retried: 0, failures: [] };
-  const startTime = Date.now();
-
-  for (const testCase of cases) {
-    const caseStart = Date.now();
-    let draftResult = null;
-    let pipelineResult = null;
-
-    try {
-      await withRetry(async () => {
-        switch (testCase.category) {
-          case "routing":
-            await runRoutingTest(runtime, testCase);
-            break;
-          case "extraction":
-            if (!llm) throw new Error("LLM client not available");
-            draftResult = await runExtractionTest(runtime, llm, testCase);
-            break;
-          case "pipeline": {
-            if (!agentService) {
-              agentService = await createAgentService(rootDir, runtime);
-            }
-            pipelineResult = await runPipelineTest(agentService, testCase);
-            break;
-          }
-          case "clarification":
-            if (!llm) throw new Error("LLM client not available");
-            draftResult = await runClarificationTest(runtime, llm, testCase);
-            break;
-          default:
-            throw new Error(`Unknown test category: ${testCase.category}`);
-        }
-      }, testCase.id, maxAttempts, { retryOnAnyError: true });
-
-      // Resolve observed trace
-      const observedTrace = resolveObservedTrace({
-        testCase,
-        draftResult: draftResult || undefined,
-        pipelineResult: pipelineResult || undefined,
-      });
-
-      // Assert routing trace expectations
-      const expect = testCase.expect || {};
-      if (expect.routing) {
-        assertRoutingTrace(observedTrace, expect.routing);
-      }
-      if (expect.toolAuthorizers) {
-        assertToolAuthorizers(observedTrace.toolCalls || [], expect.toolAuthorizers);
-      }
-
-      // Check fallback policy
-      if (testCase.fallbackPolicy === "forbid-generic" && observedTrace.structuralSkillId === "generic") {
-        throw new Error(`unexpected generic fallback for ${testCase.id}`);
-      }
-      if (testCase.fallbackPolicy === "require-generic" && observedTrace.structuralSkillId !== "generic") {
-        throw new Error(`expected generic fallback for ${testCase.id}`);
-      }
-
-      const duration = Date.now() - caseStart;
-      process.stdout.write(`${formatCaseSummary(testCase, observedTrace, "PASS")}\n`);
-
-      if (options.outputPath) {
-        appendArtifactRecord(options.outputPath, {
-          id: testCase.id,
-          category: testCase.category,
-          variant: testCase.variant,
-          family: testCase.family,
-          enabledSkillIds: observedTrace.enabledSkillIds,
-          activatedSkillIds: observedTrace.activatedSkillIds,
-          structuralSkillId: observedTrace.structuralSkillId,
-          analysisSkillId: observedTrace.analysisSkillId,
-          toolCalls: observedTrace.toolCalls,
-          status: "PASS",
-          durationMs: duration,
-        });
-      }
-
-      results.passed += 1;
-    } catch (err) {
-      draftResult = draftResult || err?.draftResult || null;
-      pipelineResult = pipelineResult || err?.pipelineResult || null;
-      const duration = Date.now() - caseStart;
-      const message = err instanceof Error ? err.message : String(err);
-
-      const failTrace = resolveObservedTrace({
-        testCase,
-        draftResult: draftResult || undefined,
-        pipelineResult: pipelineResult || undefined,
-      });
-      process.stdout.write(`${formatCaseSummary(testCase, failTrace, "FAIL")}\n`);
-      process.stdout.write(`    error: ${message}\n`);
-
-      if (options.outputPath) {
-        appendArtifactRecord(options.outputPath, {
-          id: testCase.id,
-          category: testCase.category,
-          variant: testCase.variant,
-          family: testCase.family,
-          enabledSkillIds: failTrace.enabledSkillIds,
-          activatedSkillIds: failTrace.activatedSkillIds,
-          structuralSkillId: failTrace.structuralSkillId,
-          analysisSkillId: failTrace.analysisSkillId,
-          toolCalls: failTrace.toolCalls,
-          status: "FAIL",
-          durationMs: duration,
-          error: message,
-        });
-      }
-
-      results.failed += 1;
-      results.failures.push({ id: testCase.id, error: message });
-    }
-  }
-
-  // Summary
-  const totalDuration = Date.now() - startTime;
-  process.stdout.write(`\n${"=".repeat(60)}\n`);
-  process.stdout.write(`Results: ${results.passed}/${cases.length} passed, ${results.failed} failed\n`);
-  if (results.failures.length > 0) {
-    process.stdout.write(`Failed: ${results.failures.map((f) => f.id).join(", ")}\n`);
-  }
-  process.stdout.write(`Total time: ${(totalDuration / 1000).toFixed(1)}s\n`);
-  process.stdout.write(`${"=".repeat(60)}\n\n`);
-
-  if (results.failed > 0) {
-    process.exitCode = 1;
-  }
-}
-
-module.exports = { runLlmIntegrationTests };
diff --git a/tests/llm-integration/summarize.cjs b/tests/llm-integration/summarize.cjs
deleted file mode 100644
index 7ccc99c3..00000000
--- a/tests/llm-integration/summarize.cjs
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Summarize artifact records by grouping pass/fail rate by family and variant.
- */
-function summarizeArtifacts(records) {
-  return records.reduce((acc, record) => {
-    const family = record.family || "unknown";
-    const variant = record.variant || "unknown";
-    acc[family] = acc[family] || {};
-    acc[family][variant] = acc[family][variant] || { passed: 0, failed: 0, total: 0 };
-    const bucket = acc[family][variant];
-    bucket.total += 1;
-    if (record.status === "PASS") bucket.passed += 1;
-    else bucket.failed += 1;
-    return acc;
-  }, {});
-}
-
-/**
- * Print a summary table to stdout.
- */
-function printSummary(summary) {
-  for (const [family, variants] of Object.entries(summary)) {
-    process.stdout.write(`\n${family}:\n`);
-    for (const [variant, stats] of Object.entries(variants)) {
-      process.stdout.write(`  ${variant}: passed=${stats.passed}, failed=${stats.failed}, total=${stats.total}\n`);
-    }
-  }
-}
-
-module.exports = { summarizeArtifacts, printSummary };
diff --git a/tests/regression/backend-regression.js b/tests/regression/backend-regression.js
index a48c6195..d9be58c5 100644
--- a/tests/regression/backend-regression.js
+++ b/tests/regression/backend-regression.js
@@ -20,6 +20,7 @@ const BACKEND_STEPS = [
   ["Chat stream contract regression", "validate-chat-stream-contract"],
   ["Chat message routing contract", "validate-chat-message-routing"],
   ["Report narrative contract", "validate-report-narrative-contract"],
+  ["Skill routing regression", "validate-skill-routing"],
 ];
 
 const JEST_ENV_FORWARD_KEYS = [
diff --git a/tests/regression/backend-validations.js b/tests/regression/backend-validations.js
index 239fd24a..833745ca 100644
--- a/tests/regression/backend-validations.js
+++ b/tests/regression/backend-validations.js
@@ -1559,6 +1559,62 @@ async function validateStructureJsonSkill(context) {
   console.log("[ok] validation module runtime exports");
 }
 
+/**
+ * Deterministic skill-routing regression.
+ *
+ * Loads the LLM fixtures, keeps the cases whose category is "routing", sends
+ * each case's first message through `AgentSkillRuntime.detectStructuralType`,
+ * and compares the result with the `inferredType` / `structuralTypeKey` fields
+ * of the case's `expect` block. A case that sets neither field is executed but
+ * not checked.
+ *
+ * @param {object} context Validation context; only `context.rootDir` is read here.
+ * @returns {Promise<void>} Logs an "[ok]" summary line once every case has run.
+ */
+async function validateSkillRouting(context) {
+  const { loadLlmFixtures } = require("../llm-integration/lib/discovery.cjs");
+  const AgentSkillRuntime = await importAgentSkillRuntime(context.rootDir);
+  const runtime = new AgentSkillRuntime();
+
+  const allCases = loadLlmFixtures(context.rootDir);
+  const routingCases = allCases.filter((c) => c.category === "routing");
+
+  assert(routingCases.length > 0, "should have at least one routing fixture case");
+
+  let passed = 0;
+  for (const testCase of routingCases) {
+    // Any locale other than "zh" is routed as "en".
+    const locale = testCase.locale === "zh" ? "zh" : "en";
+    // Name the fixture instead of crashing with a bare TypeError on `messages[0]`.
+    assert(testCase.messages != null, `[${testCase.id}] routing fixture has no messages`);
+    const message = testCase.messages[0];
+    const match = await runtime.detectStructuralType(message, locale, undefined, testCase.enabledSkillIds);
+    const expected = testCase.expect || {};
+
+    if (expected.inferredType || expected.structuralTypeKey) {
+      // Guarded only when there is something to compare, so cases without
+      // expectations behave exactly as before.
+      assert(Boolean(match), `[${testCase.id}] detectStructuralType returned no match`);
+    }
+    if (expected.inferredType) {
+      const actualKey = match.mappedType || match.key;
+      assert(
+        actualKey === expected.inferredType || match.skillId === expected.inferredType,
+        `[${testCase.id}] expected inferredType="${expected.inferredType}", got key="${match.key}" mappedType="${match.mappedType}" skillId="${match.skillId}"`
+      );
+    }
+    if (expected.structuralTypeKey) {
+      assert(
+        match.key === expected.structuralTypeKey || match.mappedType === expected.structuralTypeKey,
+        `[${testCase.id}] expected structuralTypeKey="${expected.structuralTypeKey}", got key="${match.key}" mappedType="${match.mappedType}"`
+      );
+    }
+    passed += 1;
+  }
+
+  console.log(`[ok] skill routing: ${passed}/${routingCases.length} cases passed`);
+}
+
 const BACKEND_VALIDATIONS = {
   "validate-agent-orchestration": validateAgentOrchestration,
   "validate-agent-base-chat-fallback": validateAgentBaseChatFallback,
@@ -1580,6 +1636,7 @@ const BACKEND_VALIDATIONS = {
   "validate-report-narrative-contract": validateReportNarrativeContract,
   "validate-dev-startup-guards": validateDevStartupGuards,
   "validate-structure-json-skill": validateStructureJsonSkill,
+  "validate-skill-routing": validateSkillRouting,
 };
 
 async function runBackendValidation(name, context) {
diff --git a/tests/runner.mjs b/tests/runner.mjs
index 2197db55..1e807f0b 100644
--- a/tests/runner.mjs
+++ b/tests/runner.mjs
@@ -11,8 +11,6 @@ const { runBackendRegression } = require("./regression/backend-regression.js");
 const { runAnalysisRegression } = require("./regression/analysis-regression.js");
 const { runNativeInstallSmoke } = require("./smoke/install-smoke.cjs");
 
-const { runLlmIntegrationTests } = require("./llm-integration/runner.cjs");
-const { summarizeArtifacts, printSummary } = require("./llm-integration/summarize.cjs");
 const { runBenchmark } = require("./llm-benchmark/runner.cjs");
 
 function parseCliOptions(args) {
@@ -73,22 +71,15 @@ Commands:
   validate --list       List named validations
   check <name>          Run a grouped validation alias
   check --list          List grouped validation aliases
-  backend-regression    Backend regression bundle: build, lint, Jest, and validations
+  backend-regression    Backend regression bundle: build, lint, Jest, and validations (includes skill routing)
   analysis-regression   Deterministic engineering analysis regression
-  llm-integration       Legacy LLM/routing integration tests (requires LLM_API_KEY)
-                        supports: node tests/runner.mjs llm-integration [category]
-                        default: routing
-                        categories: routing | extraction | pipeline | clarification
-                          [--family <family>]  (alias: --skill)
-                          [--variant <specific|generic|auto>]
-                          [--scenario <scenarioId>]
-                          [--output <artifact.json>]
-  llm-benchmark         LangGraph agent benchmark (requires LLM_API_KEY)
-                        runs the full ReAct agent and evaluates scenario quality
+  llm-benchmark         LangGraph agent benchmark with v2 assertions (requires LLM_API_KEY)
+                        runs full ReAct agent end-to-end with skill-hit tracing and LLM-as-Judge
+                        assertion types: structural_type | has_model | has_analysis | has_report |
+                                         skill_match | natural_language
                           [--scenario <scenarioId>]
                           [--output <results.json>]
-  llm-summary <path>   Summarize LLM test artifacts by family/variant
-  smoke-native          Native install/build compatibility smoke
+  smoke-native          CI-style native install smoke (npm ci + build)
 
 Replaces former sclaw commands:
   sclaw validate ...    -> node tests/runner.mjs validate ...
@@ -150,31 +141,9 @@ async function main() {
     case "smoke-native":
       await runNativeInstallSmoke(rootDir);
       return;
-    case "llm-integration":
-      await runLlmIntegrationTests(rootDir, rawArgs);
-      return;
     case "llm-benchmark":
       await runBenchmark(rootDir, rawArgs);
       return;
-    case "llm-summary": {
-      const artifactPath = rawArgs[0];
-      if (!artifactPath) {
-        throw new Error("Usage: node tests/runner.mjs llm-summary <artifact.json>");
-      }
-      const fs = require("node:fs");
-      if (!fs.existsSync(artifactPath)) {
-        throw new Error(`Artifact file not found: ${artifactPath}`);
-      }
-      const parsed = JSON.parse(fs.readFileSync(artifactPath, "utf-8"));
-      if (!Array.isArray(parsed)) {
-        throw new Error(`Expected a JSON array in ${artifactPath}, got ${typeof parsed}`);
-      }
-      const records = parsed;
-      const summary = summarizeArtifacts(records);
-      printSummary(summary);
-      process.stdout.write("\n");
-      return;
-    }
     default:
       throw new Error(`Unknown command: ${cmd}`);
   }