diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 4307ae44..ee29d1e4 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -39,23 +39,27 @@ jobs: steps: - name: Get PR Reference id: get-ref - run: | - set -euo pipefail - if [ "${{ github.event_name }}" == "issue_comment" ]; then - PR_URL="${{ github.event.issue.pull_request.url }}" - SHA=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$PR_URL" | jq -r .head.sha) - if [ -z "$SHA" ] || [ "$SHA" = "null" ]; then - echo "::error::Failed to resolve PR head SHA" - exit 1 - fi - echo "sha=$SHA" >> "$GITHUB_OUTPUT" - echo "issue_number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT" - echo "comment_user=${{ github.event.comment.user.login }}" >> "$GITHUB_OUTPUT" - else - echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT" - echo "issue_number=" >> "$GITHUB_OUTPUT" - echo "comment_user=" >> "$GITHUB_OUTPUT" - fi + uses: actions/github-script@v9 + with: + script: | + const eventName = context.eventName; + if (eventName === 'issue_comment') { + const [owner, repo] = context.payload.repository.full_name.split('/'); + const prNumber = context.payload.issue.number; + const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: prNumber }); + const sha = pr.head?.sha; + if (!sha) { + core.setFailed('Failed to resolve PR head SHA'); + return; + } + core.setOutput('sha', sha); + core.setOutput('issue_number', String(context.issue.number)); + core.setOutput('comment_user', context.payload.comment.user.login); + } else { + core.setOutput('sha', '${{ github.sha }}'); + core.setOutput('issue_number', ''); + core.setOutput('comment_user', ''); + } - name: Reply with Action Link if: github.event_name == 'issue_comment' @@ -176,15 +180,20 @@ jobs: if: always() && needs.check-permission.outputs.trigger_issue_number != '' continue-on-error: true uses: actions/github-script@v9 + env: + ISSUE_NUMBER: ${{ needs.check-permission.outputs.trigger_issue_number }} + 
COMMENT_USER: ${{ needs.check-permission.outputs.trigger_comment_user }} + TEST_OUTCOME: ${{ steps.run-tests.outcome }} + SUMMARY_JSON: ${{ toJson(steps.summary.outputs.summary) }} with: github-token: ${{ secrets.COMMENT_TOKEN }} script: | - const issueNumber = Number('${{ needs.check-permission.outputs.trigger_issue_number }}'); - const commentUser = '${{ needs.check-permission.outputs.trigger_comment_user }}'; - const success = '${{ steps.run-tests.outcome }}' === 'success'; + const issueNumber = Number(process.env.ISSUE_NUMBER); + const commentUser = process.env.COMMENT_USER; + const success = process.env.TEST_OUTCOME === 'success'; + const summary = JSON.parse(process.env.SUMMARY_JSON || '""'); const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; - const summary = ${{ toJson(steps.summary.outputs.summary) }}; const icon = success ? '✅' : '❌'; const status = success ? 'PASSED' : 'FAILED'; const body = [ diff --git a/.github/workflows/llm-integration.yml b/.github/workflows/llm-benchmark.yml similarity index 56% rename from .github/workflows/llm-integration.yml rename to .github/workflows/llm-benchmark.yml index dbc07453..89deab51 100644 --- a/.github/workflows/llm-integration.yml +++ b/.github/workflows/llm-benchmark.yml @@ -1,4 +1,4 @@ -name: LLM Integration Tests +name: LLM Benchmark on: workflow_dispatch: @@ -7,24 +7,12 @@ on: description: "LLM model name (leave empty for default)" required: false default: "" - filter: - description: "Test category filter (routing|extraction|pipeline|clarification)" + scenario: + description: "Run a single scenario by ID (leave empty for all)" required: false default: "" issue_comment: types: [created] - push: - branches: - - master - paths: - - 'backend/**' - - 'scripts/**' - - 'tests/**' - - 'sclaw' - - 'sclaw_cn' - - 'package.json' - - '.github/workflows/llm-integration.yml' - permissions: contents: read @@ -36,41 +24,44 @@ jobs: issues: write if: | github.event_name 
== 'workflow_dispatch' || - github.event_name == 'push' || (github.event.issue.pull_request && - contains(github.event.comment.body, '/test-llm') && + contains(github.event.comment.body, '/test-llm-benchmark') && (github.event.comment.user.login == 'guyi2000' || github.event.comment.user.login == 'qinsz01')) runs-on: ubuntu-latest outputs: pr_sha: ${{ steps.get-ref.outputs.sha }} - filter_arg: ${{ steps.get-ref.outputs.filter_arg }} + scenario_arg: ${{ steps.get-ref.outputs.scenario_arg }} trigger_issue_number: ${{ steps.get-ref.outputs.issue_number }} trigger_comment_user: ${{ steps.get-ref.outputs.comment_user }} steps: - name: Get PR Reference id: get-ref - env: - COMMENT_BODY: ${{ github.event.comment.body }} - run: | - set -euo pipefail - if [ "${{ github.event_name }}" == "issue_comment" ]; then - PR_URL="${{ github.event.issue.pull_request.url }}" - SHA=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$PR_URL" | jq -r .head.sha) - if [ -z "$SHA" ] || [ "$SHA" = "null" ]; then - echo "::error::Failed to resolve PR head SHA" - exit 1 - fi - echo "sha=$SHA" >> "$GITHUB_OUTPUT" - FILTER=$(echo "$COMMENT_BODY" | sed -n 's|/test-llm[[:space:]]*\(.*\)|\1|p' | xargs) - echo "filter_arg=$FILTER" >> "$GITHUB_OUTPUT" - echo "issue_number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT" - echo "comment_user=${{ github.event.comment.user.login }}" >> "$GITHUB_OUTPUT" - else - echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT" - echo "filter_arg=${{ github.event.inputs.filter }}" >> "$GITHUB_OUTPUT" - echo "issue_number=" >> "$GITHUB_OUTPUT" - echo "comment_user=" >> "$GITHUB_OUTPUT" - fi + uses: actions/github-script@v9 + with: + script: | + const eventName = context.eventName; + if (eventName === 'issue_comment') { + const [owner, repo] = context.payload.repository.full_name.split('/'); + const prNumber = context.payload.issue.number; + const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: prNumber }); + const sha = 
pr.head?.sha; + if (!sha) { + core.setFailed('Failed to resolve PR head SHA'); + return; + } + const body = context.payload.comment.body || ''; + const match = body.match(/\/test-llm-benchmark\s*([a-zA-Z0-9_-]*)/); + const scenario = match ? match[1] : ''; + core.setOutput('sha', sha); + core.setOutput('scenario_arg', scenario); + core.setOutput('issue_number', String(context.issue.number)); + core.setOutput('comment_user', context.payload.comment.user.login); + } else { + core.setOutput('sha', '${{ github.sha }}'); + core.setOutput('scenario_arg', String(context.payload.inputs?.scenario ?? '').trim().match(/^[a-zA-Z0-9_-]*/)[0]); + core.setOutput('issue_number', ''); + core.setOutput('comment_user', ''); + } - name: Reply with Action Link if: github.event_name == 'issue_comment' @@ -81,9 +72,9 @@ jobs: script: | const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; const body = [ - `✅ **LLM Integration Tests Triggered!**`, + `✅ **LLM Benchmark Triggered!**`, ``, - `@${context.payload.comment.user.login}, I've started the workflow for you.`, + `@${context.payload.comment.user.login}, I've started the benchmark run.`, `Please click the link below to monitor progress and **approve the deployment** to the test environment:`, ``, `🚀 **[View Action Run Details](${runUrl})**`, @@ -96,7 +87,7 @@ jobs: body: body }) - llm-integration: + llm-benchmark: needs: check-permission permissions: contents: read @@ -104,9 +95,9 @@ jobs: environment: test runs-on: ubuntu-latest concurrency: - group: llm-integration-${{ github.event.issue.number || github.sha }} + group: llm-benchmark-${{ github.event.issue.number || github.sha }} cancel-in-progress: true - timeout-minutes: 60 + timeout-minutes: 90 env: NEXT_TELEMETRY_DISABLED: 1 steps: @@ -133,33 +124,44 @@ jobs: - name: Build via sclaw run: node ./sclaw build - - name: Run LLM integration tests - id: run-tests + - name: Run LLM benchmark + id: run-benchmark env: LLM_API_KEY: ${{ secrets.LLM_API_KEY }} LLM_MODEL: ${{ 
github.event.inputs.llm_model || vars.LLM_MODEL }} LLM_BASE_URL: ${{ vars.LLM_BASE_URL }} + LLM_JUDGE_API_KEY: ${{ secrets.LLM_JUDGE_API_KEY || secrets.LLM_API_KEY }} + LLM_JUDGE_MODEL: ${{ vars.LLM_JUDGE_MODEL }} LLM_LOG_ENABLED: "true" LLM_LOG_DIR: ${{ github.workspace }}/.structureclaw/logs - DATABASE_URL: "file:.structureclaw/data/structureclaw-llm-ci.db" + DATABASE_URL: "file:.structureclaw/data/structureclaw-benchmark-ci.db" + SCENARIO_ARG: ${{ needs.check-permission.outputs.scenario_arg }} run: | set -o pipefail - FILTER_ARG="${{ needs.check-permission.outputs.filter_arg }}" - if [ -n "$FILTER_ARG" ]; then - node tests/runner.mjs llm-integration --filter "$FILTER_ARG" 2>&1 | tee test-output.txt + if [ -n "$SCENARIO_ARG" ]; then + node tests/runner.mjs llm-benchmark --scenario "$SCENARIO_ARG" --output benchmark-results.json 2>&1 | tee test-output.txt else - node tests/runner.mjs llm-integration 2>&1 | tee test-output.txt + node tests/runner.mjs llm-benchmark --output benchmark-results.json 2>&1 | tee test-output.txt fi + - name: Upload benchmark results + if: always() + uses: actions/upload-artifact@v7 + with: + name: benchmark-results + path: | + benchmark-results.json + .structureclaw/logs/ + - name: Extract test summary if: always() id: summary run: | if [ ! -f test-output.txt ]; then - echo "summary=Tests did not produce output." >> "$GITHUB_OUTPUT" + echo "summary=Benchmark did not produce output." 
>> "$GITHUB_OUTPUT" exit 0 fi - SUMMARY=$(grep -B 1 -A 3 '^Results:' test-output.txt || echo "No summary found") + SUMMARY=$(grep -B 1 -A 5 '^Benchmark Results:' test-output.txt || echo "No summary found") echo "summary<<EOF" >> "$GITHUB_OUTPUT" echo "$SUMMARY" >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" @@ -168,27 +170,32 @@ jobs: if: always() && needs.check-permission.outputs.trigger_issue_number != '' continue-on-error: true uses: actions/github-script@v9 + env: + ISSUE_NUMBER: ${{ needs.check-permission.outputs.trigger_issue_number }} + COMMENT_USER: ${{ needs.check-permission.outputs.trigger_comment_user }} + BENCHMARK_OUTCOME: ${{ steps.run-benchmark.outcome }} + SUMMARY_JSON: ${{ toJson(steps.summary.outputs.summary) }} with: github-token: ${{ secrets.COMMENT_TOKEN }} script: | - const issueNumber = Number('${{ needs.check-permission.outputs.trigger_issue_number }}'); - const commentUser = '${{ needs.check-permission.outputs.trigger_comment_user }}'; - const success = '${{ steps.run-tests.outcome }}' === 'success'; - const summary = ${{ toJson(steps.summary.outputs.summary) }}; + const issueNumber = Number(process.env.ISSUE_NUMBER); + const commentUser = process.env.COMMENT_USER; + const success = process.env.BENCHMARK_OUTCOME === 'success'; + const summary = JSON.parse(process.env.SUMMARY_JSON || '""'); const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; const icon = success ? '✅' : '❌'; const status = success ? 
'PASSED' : 'FAILED'; const body = [ - `${icon} **LLM Integration Tests ${status}**`, + `${icon} **LLM Benchmark ${status}**`, ``, - `@${commentUser}, the test run has completed.`, + `@${commentUser}, the benchmark run has completed.`, ``, `\`\`\``, summary, `\`\`\``, ``, - `📦 **[Download Logs](${runUrl})** or check the Artifacts section.`, + `📦 **[Download Results](${runUrl})** or check the Artifacts section.`, ].join('\n'); await github.rest.issues.createComment({ @@ -197,12 +204,3 @@ jobs: repo: context.repo.repo, body: body }) - - - name: Upload LLM call logs - if: always() - uses: actions/upload-artifact@v7 - with: - name: llm-logs-ubuntu - path: .structureclaw/logs/*.jsonl - retention-days: 7 - if-no-files-found: ignore diff --git a/AGENTS.md b/AGENTS.md index c331e03a..641d924f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -67,7 +67,8 @@ node tests/runner.mjs validate validate-agent-orchestration # Agent orchestrati node tests/runner.mjs validate validate-chat-stream-contract # Chat stream contract node tests/runner.mjs validate validate-analyze-contract # Analyze endpoint contract node tests/runner.mjs smoke-native # Full native install smoke (mirrors CI) -node tests/runner.mjs llm-integration # LLM integration tests (needs LLM_API_KEY) +node tests/runner.mjs llm-benchmark # LLM benchmark: v2 assertions, skill-trace, LLM-as-Judge (needs LLM_API_KEY) +node tests/runner.mjs llm-benchmark --scenario <id> # Run a single benchmark scenario ``` ## Key Conventions diff --git a/backend/src/agent-skills/structure-type/beam/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/beam/__llm_tests__/cases.json index 1bbe4762..968cf506 100644 --- a/backend/src/agent-skills/structure-type/beam/__llm_tests__/cases.json +++ b/backend/src/agent-skills/structure-type/beam/__llm_tests__/cases.json @@ -6,7 +6,9 @@ "scenarioId": "beam-routing-zh", "category": "routing", "locale": "zh", - "messages": ["一根简支梁,跨度6米"], + "messages": [ + "一根简支梁,跨度6米" + ], "variants": { "legacy": { 
"expect": { @@ -20,7 +22,9 @@ "scenarioId": "beam-routing-en", "category": "routing", "locale": "en", - "messages": ["a simply supported beam, span 6m"], + "messages": [ + "a simply supported beam, span 6m" + ], "variants": { "legacy": { "expect": { @@ -29,274 +33,6 @@ } } } - }, - { - "scenarioId": "beam-params-zh", - "category": "extraction", - "locale": "zh", - "messages": ["一根悬臂梁,长3米,端部集中力10kN"], - "variants": { - "specific": { - "enabledSkillIds": ["beam"], - "fallbackPolicy": "forbid-generic", - "expect": { - "inferredType": "beam", - "criticalMissing": [], - "draftPatch": { - "lengthM": { - "value": 3, - "tolerance": 0.05 - }, - "supportType": "cantilever", - "loadKN": { - "value": 10, - "tolerance": 0.05 - }, - "loadType": "point" - } - } - }, - "generic": { - "enabledSkillIds": ["generic"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } - }, - { - "scenarioId": "beam-params-en", - "category": "extraction", - "locale": "en", - "messages": ["cantilever beam, 4m long, point load 15kN at tip"], - "variants": { - "specific": { - "enabledSkillIds": ["beam"], - "fallbackPolicy": "forbid-generic", - "expect": { - "inferredType": "beam", - "criticalMissing": [], - "draftPatch": { - "lengthM": { - "value": 4, - "tolerance": 0.05 - }, - "supportType": "cantilever", - "loadKN": { - "value": 15, - "tolerance": 0.05 - }, - "loadType": "point" - } - } - }, - "generic": { - "enabledSkillIds": ["generic"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } - }, - { - "scenarioId": "beam-distributed-zh", - "category": "extraction", - "locale": "zh", - "messages": ["简支梁6米,均布荷载20kN/m"], - "variants": { - "specific": { - "enabledSkillIds": ["beam"], - "fallbackPolicy": "forbid-generic", - "expect": { - "inferredType": "beam", - "criticalMissing": [], - "draftPatch": { - "lengthM": { - "value": 6, - "tolerance": 0.05 - }, - "supportType": 
"simply-supported", - "loadKN": { - "value": 20, - "tolerance": 0.05 - }, - "loadType": "distributed" - } - } - }, - "generic": { - "enabledSkillIds": ["generic"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } - }, - { - "scenarioId": "beam-position-zh", - "category": "extraction", - "locale": "zh", - "messages": ["简支梁8米,在距左端3米处作用集中力25kN"], - "variants": { - "specific": { - "enabledSkillIds": ["beam"], - "fallbackPolicy": "forbid-generic", - "expect": { - "inferredType": "beam", - "criticalMissing": [], - "draftPatch": { - "lengthM": { - "value": 8, - "tolerance": 0.05 - }, - "loadKN": { - "value": 25, - "tolerance": 0.05 - }, - "loadPositionM": { - "value": 3, - "tolerance": 0.05 - } - } - } - }, - "generic": { - "enabledSkillIds": ["generic"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } - }, - { - "scenarioId": "beam-pipeline-zh", - "category": "pipeline", - "locale": "zh", - "messages": ["简支梁6米,均布荷载20kN/m,请进行静力分析"], - "variants": { - "specific": { - "enabledSkillIds": ["beam", "opensees-static", "validation-structure-model"], - "fallbackPolicy": "forbid-generic", - "expect": { - "routing": { - "structuralSkillId": "beam", - "analysisSkillId": "opensees-static" - }, - "success": true, - "toolCalls": [ - "build_model", - "run_analysis" - ], - "modelBuilt": true, - "analysisSuccess": true - } - }, - "generic": { - "enabledSkillIds": ["generic", "opensees-static"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { - "structuralSkillId": "generic", - "analysisSkillId": "opensees-static" - } - } - } - } - }, - { - "scenarioId": "beam-clarify-zh", - "category": "clarification", - "locale": "zh", - "turns": [ - { - "message": "一根梁", - "assertions": { - "criticalMissingIncludes": [ - "lengthM" - ], - "modelBuilt": false - } - }, - { - "message": "6米", - "assertions": { - "criticalMissingIncludes": [ - "loadKN" - ], - 
"modelBuilt": false - } - }, - { - "message": "20kN均布荷载", - "assertions": { - "criticalMissing": [], - "modelBuilt": true, - "draftPatch": { - "lengthM": { - "value": 6, - "tolerance": 0.05 - }, - "supportType": "simply-supported", - "loadKN": { - "value": 20, - "tolerance": 0.05 - }, - "loadType": "distributed" - } - } - } - ], - "variants": { - "legacy": { - "expect": {} - } - } - }, - { - "scenarioId": "beam-clarify-revise", - "category": "clarification", - "locale": "zh", - "turns": [ - { - "message": "简支梁6米,均布荷载10kN/m", - "assertions": { - "criticalMissing": [], - "modelBuilt": true, - "draftPatch": { - "lengthM": { - "value": 6, - "tolerance": 0.05 - }, - "loadKN": { - "value": 10, - "tolerance": 0.05 - } - } - } - }, - { - "message": "改成8米", - "assertions": { - "criticalMissing": [], - "modelBuilt": true, - "draftPatch": { - "lengthM": { - "value": 8, - "tolerance": 0.05 - } - } - } - } - ], - "variants": { - "legacy": { - "expect": {} - } - } } ] } diff --git a/backend/src/agent-skills/structure-type/double-span-beam/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/double-span-beam/__llm_tests__/cases.json index 94d1aa74..5ebaf6f9 100644 --- a/backend/src/agent-skills/structure-type/double-span-beam/__llm_tests__/cases.json +++ b/backend/src/agent-skills/structure-type/double-span-beam/__llm_tests__/cases.json @@ -6,7 +6,9 @@ "scenarioId": "double-span-beam-routing-zh", "category": "routing", "locale": "zh", - "messages": ["双跨连续梁,总长12m,两跨各6m"], + "messages": [ + "双跨连续梁,总长12m,两跨各6m" + ], "variants": { "legacy": { "expect": { @@ -15,28 +17,6 @@ } } } - }, - { - "scenarioId": "double-span-beam-basic", - "category": "extraction", - "locale": "zh", - "messages": ["双跨连续梁,总长12m,两跨各6m,均布荷载18kN/m"], - "variants": { - "specific": { - "enabledSkillIds": ["double-span-beam"], - "fallbackPolicy": "forbid-generic", - "expect": { - "routing": { "structuralSkillId": "double-span-beam" } - } - }, - "generic": { - "enabledSkillIds": ["generic"], - 
"fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } } ] } diff --git a/backend/src/agent-skills/structure-type/frame/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/frame/__llm_tests__/cases.json index 3118adff..6768b089 100644 --- a/backend/src/agent-skills/structure-type/frame/__llm_tests__/cases.json +++ b/backend/src/agent-skills/structure-type/frame/__llm_tests__/cases.json @@ -6,7 +6,9 @@ "scenarioId": "frame-routing-zh", "category": "routing", "locale": "zh", - "messages": ["三层单跨钢框架,层高3.6m,跨度6m"], + "messages": [ + "三层单跨钢框架,层高3.6m,跨度6m" + ], "variants": { "legacy": { "expect": { @@ -20,7 +22,9 @@ "scenarioId": "frame-routing-en", "category": "routing", "locale": "en", - "messages": ["3-story single-bay steel frame, story height 3.6m, bay width 6m"], + "messages": [ + "3-story single-bay steel frame, story height 3.6m, bay width 6m" + ], "variants": { "legacy": { "expect": { @@ -29,179 +33,6 @@ } } } - }, - { - "scenarioId": "frame-extraction-multi-story", - "category": "extraction", - "locale": "zh", - "messages": ["4层2跨钢框架,层高3.3m,跨度5.4m,楼面荷载12kN/m²"], - "variants": { - "specific": { - "enabledSkillIds": ["frame"], - "fallbackPolicy": "forbid-generic", - "expect": { - "inferredType": "frame", - "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCount", "bayWidthsM"], - "draftPatch": { - "storyCount": 4, - "bayCount": 2, - "storyHeightsM": { "value": [3.3, 3.3, 3.3, 3.3], "tolerance": 0.05 }, - "bayWidthsM": { "value": [5.4, 5.4], "tolerance": 0.05 } - } - } - }, - "generic": { - "enabledSkillIds": ["generic"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } - }, - { - "scenarioId": "frame-params-en", - "category": "extraction", - "locale": "en", - "messages": ["2-story 3-bay frame, story height 3.6m, bay widths 6m, 7.5m, 6m"], - "variants": { - "legacy": { - "expect": { - 
"inferredType": "frame", - "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCount", "bayWidthsM"], - "draftPatch": { - "storyCount": 2, - "bayCount": 3, - "bayWidthsM": { "value": [6, 7.5, 6], "tolerance": 0.05 } - } - } - } - } - }, - { - "scenarioId": "frame-steel-zh", - "category": "extraction", - "locale": "zh", - "messages": ["6层3跨钢框架Q345,层高3.6m,跨度7.2m"], - "variants": { - "legacy": { - "expect": { - "inferredType": "frame", - "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCount", "bayWidthsM"], - "draftPatch": { - "storyCount": 6, - "bayCount": 3, - "frameMaterial": "Q345" - } - } - } - } - }, - { - "scenarioId": "frame-3d-zh", - "category": "extraction", - "locale": "zh", - "messages": ["3层2x3跨空间框架,层高4m,X向跨度6m,Y向跨度7.5m"], - "variants": { - "legacy": { - "expect": { - "inferredType": "frame", - "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCountX", "bayCountY", "bayWidthsXM", "bayWidthsYM"], - "draftPatch": { - "frameDimension": "3d", - "storyCount": 3, - "bayCountX": 2, - "bayCountY": 3 - } - } - } - } - }, - { - "scenarioId": "frame-static-basic", - "category": "pipeline", - "locale": "en", - "messages": ["2-story single-bay steel frame, story height 3.6m, bay 6m, floor load 10kN/m2, analyze and check against steel code"], - "variants": { - "specific": { - "enabledSkillIds": ["frame", "opensees-static", "code-check-gb50017", "validation-structure-model", "postprocess-builtin"], - "fallbackPolicy": "forbid-generic", - "expect": { - "routing": { - "selectedSkillIds": ["code-check-gb50017", "frame", "opensees-static", "validation-structure-model", "postprocess-builtin"], - "structuralSkillId": "frame", - "analysisSkillId": "opensees-static" - }, - "toolCalls": ["build_model", "run_analysis", "run_code_check"], - "toolAuthorizers": { - "build_model": ["frame"], - "run_analysis": ["opensees-static"], - "run_code_check": ["code-check-gb50017"] - } - } - 
}, - "generic": { - "enabledSkillIds": ["generic", "opensees-static", "code-check-gb50017", "validation-structure-model", "postprocess-builtin"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { - "selectedSkillIds": ["code-check-gb50017", "generic", "opensees-static", "validation-structure-model", "postprocess-builtin"], - "structuralSkillId": "generic", - "analysisSkillId": "opensees-static" - }, - "toolCalls": ["build_model", "run_analysis", "run_code_check"], - "toolAuthorizers": { - "run_code_check": ["code-check-gb50017"] - } - } - } - } - }, - { - "scenarioId": "frame-pipeline-multi-bay-zh", - "category": "pipeline", - "locale": "zh", - "messages": ["3层2跨框架,层高3.3m,跨度5.4m和6m,每层楼面荷载15kN/m,请进行静力分析并输出报告"], - "enabledSkillIds": ["frame", "opensees-static", "validation-structure-model"], - "variants": { - "legacy": { - "expect": { - "success": true, - "toolCalls": ["build_model", "run_analysis"], - "modelBuilt": true, - "analysisSuccess": true - } - } - } - }, - { - "scenarioId": "frame-clarify-en", - "category": "clarification", - "locale": "en", - "turns": [ - { - "message": "a steel frame", - "assertions": { - "criticalMissingIncludes": ["storyCount"], - "modelBuilt": false - } - }, - { - "message": "3 stories, 4.2m each, single bay 8m, floor load 12kN/m2", - "assertions": { - "criticalMissingNotIncludes": ["frameDimension", "storyCount", "storyHeightsM", "bayCount", "bayWidthsM"], - "draftPatch": { - "storyCount": 3, - "bayCount": 1 - } - } - } - ], - "variants": { - "legacy": { - "expect": {} - } - } } ] } diff --git a/backend/src/agent-skills/structure-type/portal-frame/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/portal-frame/__llm_tests__/cases.json index bdad1f62..b24b2864 100644 --- a/backend/src/agent-skills/structure-type/portal-frame/__llm_tests__/cases.json +++ b/backend/src/agent-skills/structure-type/portal-frame/__llm_tests__/cases.json @@ -6,7 +6,9 @@ "scenarioId": "portal-frame-routing-zh", "category": 
"routing", "locale": "zh", - "messages": ["门式刚架,跨度18m,高度6m"], + "messages": [ + "门式刚架,跨度18m,高度6m" + ], "variants": { "legacy": { "expect": { @@ -20,7 +22,9 @@ "scenarioId": "portal-frame-routing-en", "category": "routing", "locale": "en", - "messages": ["portal frame, span 18m, height 6m"], + "messages": [ + "portal frame, span 18m, height 6m" + ], "variants": { "legacy": { "expect": { @@ -29,166 +33,6 @@ } } } - }, - { - "scenarioId": "portal-frame-params-zh", - "category": "extraction", - "locale": "zh", - "messages": ["门式刚架,跨度24m,高度8m,屋面荷载5kN/m"], - "variants": { - "specific": { - "enabledSkillIds": ["portal-frame"], - "fallbackPolicy": "forbid-generic", - "expect": { - "inferredType": "portal-frame", - "criticalMissing": [], - "draftPatch": { - "spanLengthM": { - "value": 24, - "tolerance": 0.05 - }, - "heightM": { - "value": 8, - "tolerance": 0.05 - }, - "loadKN": { - "value": 5, - "tolerance": 0.05 - } - } - } - }, - "generic": { - "enabledSkillIds": ["generic"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } - }, - { - "scenarioId": "portal-frame-params-en", - "category": "extraction", - "locale": "en", - "messages": ["portal frame, span 21m, height 7.5m, roof load 8kN"], - "variants": { - "specific": { - "enabledSkillIds": ["portal-frame"], - "fallbackPolicy": "forbid-generic", - "expect": { - "inferredType": "portal-frame", - "criticalMissing": [], - "draftPatch": { - "spanLengthM": { - "value": 21, - "tolerance": 0.05 - }, - "heightM": { - "value": 7.5, - "tolerance": 0.05 - }, - "loadKN": { - "value": 8, - "tolerance": 0.05 - } - } - } - }, - "generic": { - "enabledSkillIds": ["generic"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } - }, - { - "scenarioId": "portal-frame-pipeline-zh", - "category": "pipeline", - "locale": "zh", - "messages": ["门式刚架,跨度18m,高度7m,屋面荷载6kN/m,分析"], - "variants": { - "specific": { - 
"enabledSkillIds": ["portal-frame", "opensees-static", "validation-structure-model"], - "fallbackPolicy": "forbid-generic", - "expect": { - "routing": { - "structuralSkillId": "portal-frame", - "analysisSkillId": "opensees-static" - }, - "success": true, - "toolCalls": [ - "build_model", - "run_analysis" - ], - "modelBuilt": true, - "analysisSuccess": true - } - }, - "generic": { - "enabledSkillIds": ["generic", "opensees-static"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { - "structuralSkillId": "generic", - "analysisSkillId": "opensees-static" - } - } - } - } - }, - { - "scenarioId": "portal-frame-clarify-zh", - "category": "clarification", - "locale": "zh", - "turns": [ - { - "message": "门式刚架", - "assertions": { - "criticalMissingIncludes": [ - "spanLengthM" - ], - "modelBuilt": false - } - }, - { - "message": "跨度24m,高8m", - "assertions": { - "criticalMissingIncludes": [ - "loadKN" - ], - "modelBuilt": false - } - }, - { - "message": "荷载10kN/m", - "assertions": { - "criticalMissing": [], - "modelBuilt": true, - "draftPatch": { - "spanLengthM": { - "value": 24, - "tolerance": 0.05 - }, - "heightM": { - "value": 8, - "tolerance": 0.05 - }, - "loadKN": { - "value": 10, - "tolerance": 0.05 - } - } - } - } - ], - "variants": { - "legacy": { - "expect": {} - } - } } ] } diff --git a/backend/src/agent-skills/structure-type/truss/__llm_tests__/cases.json b/backend/src/agent-skills/structure-type/truss/__llm_tests__/cases.json index 51505500..95b31439 100644 --- a/backend/src/agent-skills/structure-type/truss/__llm_tests__/cases.json +++ b/backend/src/agent-skills/structure-type/truss/__llm_tests__/cases.json @@ -6,7 +6,9 @@ "scenarioId": "truss-routing-zh", "category": "routing", "locale": "zh", - "messages": ["三角桁架,跨度12m,高3m,节点荷载20kN"], + "messages": [ + "三角桁架,跨度12m,高3m,节点荷载20kN" + ], "variants": { "legacy": { "expect": { @@ -15,62 +17,6 @@ } } } - }, - { - "scenarioId": "truss-extraction-zh", - "category": "extraction", - "locale": "zh", - 
"messages": ["三角桁架,跨度12m,高3m,节点荷载20kN"], - "variants": { - "specific": { - "enabledSkillIds": ["truss"], - "fallbackPolicy": "forbid-generic", - "expect": { - "inferredType": "truss" - } - }, - "generic": { - "enabledSkillIds": ["generic"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { "structuralSkillId": "generic" } - } - } - } - }, - { - "scenarioId": "truss-static-basic", - "category": "pipeline", - "locale": "zh", - "messages": ["三角桁架,跨度12m,高3m,节点荷载20kN,做静力分析"], - "variants": { - "specific": { - "enabledSkillIds": ["truss", "opensees-static", "validation-structure-model"], - "fallbackPolicy": "forbid-generic", - "expect": { - "routing": { - "structuralSkillId": "truss", - "analysisSkillId": "opensees-static" - }, - "success": true, - "toolCalls": ["build_model", "run_analysis"], - "analysisSuccess": true - } - }, - "generic": { - "enabledSkillIds": ["generic", "opensees-static", "validation-structure-model"], - "fallbackPolicy": "require-generic", - "expect": { - "routing": { - "structuralSkillId": "generic", - "analysisSkillId": "opensees-static" - }, - "success": true, - "toolCalls": ["build_model", "run_analysis"], - "analysisSuccess": true - } - } - } } ] } diff --git a/tests/llm-benchmark/lib/evaluate.cjs b/tests/llm-benchmark/lib/evaluate.cjs index 68afa30b..7c1ca551 100644 --- a/tests/llm-benchmark/lib/evaluate.cjs +++ b/tests/llm-benchmark/lib/evaluate.cjs @@ -1,65 +1,248 @@ /** * Evaluate a completed AgentState against a scenario's expectations. * + * Supports v2 assertions (type-dispatched) with automatic v1 backward compatibility. * Returns a structured result with per-metric pass/fail and an overall score. + * + * evaluateScenario is async because natural_language assertions use LLM-as-Judge. 
*/ -function evaluateScenario(scenario, state, durationMs) { - const metrics = []; - const expect = scenario.expect || {}; - // Structural type detection - if (expect.structuralType) { - const actual = state.structuralTypeKey || null; - metrics.push({ - metric: "structuralType", - pass: actual === expect.structuralType, - expected: expect.structuralType, - actual: actual || "(none)", - }); +const { extractSkillTrace } = require("./skill-trace.cjs"); +const { evaluateNaturalLanguage } = require("./judge.cjs"); + +const ANALYSIS_RESULT_KEYS = [ + "displacements", "nodeDisplacements", "reactions", + "nodeReactions", "memberForces", "forces", +]; + +// --------------------------------------------------------------------------- +// v1 → v2 auto-upgrade +// --------------------------------------------------------------------------- + +/** + * Upgrade a v1 scenario expect object to the v2 assertions array format. + * v2 format is used when `expect.assertions` is already present. + * + * @param {object} expect - scenario.expect + * @returns {{ assertions: object[] }} + */ +function upgradeExpect(expect) { + if (Array.isArray(expect.assertions)) { + return { assertions: expect.assertions }; } - // Model building + // v1 → v2 conversion + const assertions = []; + + if (expect.structuralType) { + assertions.push({ type: "structural_type", expected: expect.structuralType }); + } if (expect.hasModel) { - const model = state.model; - const nodes = Array.isArray(model?.nodes) ? model.nodes : []; - const elements = Array.isArray(model?.elements) ? model.elements : []; - const minNodes = expect.minNodes ?? 2; - const minElements = expect.minElements ?? 1; - metrics.push({ - metric: "model", - pass: !!model && nodes.length >= minNodes && elements.length >= minElements, - expected: `>= ${minNodes} nodes, >= ${minElements} elements`, - actual: model ? 
`${nodes.length} nodes, ${elements.length} elements` : "(none)", + assertions.push({ + type: "has_model", + minNodes: expect.minNodes ?? 2, + minElements: expect.minElements ?? 1, }); } - - // Analysis completion if (expect.hasAnalysis) { - const analysis = state.analysisResult; - const hasDisplacements = analysis && ( - Array.isArray(analysis.displacements) || Array.isArray(analysis.nodeDisplacements) - ); - metrics.push({ - metric: "analysis", - pass: !!analysis && (hasDisplacements || Object.keys(analysis).length > 0), + assertions.push({ type: "has_analysis" }); + } + if (expect.hasReport) { + assertions.push({ type: "has_report" }); + } + + return { assertions }; +} + +// --------------------------------------------------------------------------- +// Typed assertion evaluators +// --------------------------------------------------------------------------- + +function evalStructuralType(assertion, state) { + const actual = state.structuralTypeKey || null; + return { + metric: "structural_type", + pass: actual === assertion.expected, + expected: assertion.expected, + actual: actual || "(none)", + }; +} + +function evalHasModel(assertion, state) { + const model = state.model; + const nodes = Array.isArray(model?.nodes) ? model.nodes : []; + const elements = Array.isArray(model?.elements) ? model.elements : []; + const minNodes = assertion.minNodes ?? 2; + const minElements = assertion.minElements ?? 1; + return { + metric: "has_model", + pass: !!model && nodes.length >= minNodes && elements.length >= minElements, + expected: `>= ${minNodes} nodes, >= ${minElements} elements`, + actual: model ? 
`${nodes.length} nodes, ${elements.length} elements` : "(none)", + }; +} + +function hasResultField(obj, names) { + if (!obj || typeof obj !== "object") return false; + for (const name of names) { + const val = obj[name]; + if (!val) continue; + if (Array.isArray(val) && val.length > 0) return true; + if (typeof val === "object" && Object.keys(val).length > 0) return true; + } + return false; +} + +function evalHasAnalysis(_assertion, state) { + const analysis = state.analysisResult; + if (!analysis) { + return { + metric: "has_analysis", + pass: false, expected: "analysis results present", - actual: analysis ? "present" : "(none)", - }); + actual: "(none)", + }; } + const pass = hasResultField(analysis, ANALYSIS_RESULT_KEYS) || hasResultField(analysis.data, ANALYSIS_RESULT_KEYS); + return { + metric: "has_analysis", + pass, + expected: "analysis results with displacements, reactions, or forces", + actual: pass ? "present" : `keys: ${Object.keys(analysis).join(", ") || "(empty)"}`, + }; +} - // Report generation - if (expect.hasReport) { - const report = state.report; - const mdLength = typeof report?.markdown === "string" ? report.markdown.length : 0; - metrics.push({ - metric: "report", - pass: mdLength > 100, - expected: "markdown > 100 chars", - actual: report ? `${mdLength} chars` : "(none)", - }); +function evalHasReport(_assertion, state) { + const report = state.report; + const mdLength = typeof report?.markdown === "string" ? report.markdown.length : 0; + return { + metric: "has_report", + pass: mdLength > 100, + expected: "markdown > 100 chars", + actual: report ? `${mdLength} chars` : "(none)", + }; +} + +function evalSkillMatch(assertion, state) { + const trace = extractSkillTrace(Array.isArray(state.messages) ? state.messages : []); + const actual = trace?.skillId || null; + const primary = assertion.primary; + const mayAlsoMatch = Array.isArray(assertion.mayAlsoMatch) ? assertion.mayAlsoMatch : []; + const allowed = primary ? 
[primary, ...mayAlsoMatch] : mayAlsoMatch; + + // If no allowed skills specified, match any non-null skill + if (allowed.length === 0) { + return { + metric: "skill_match", + pass: actual !== null, + expected: "(any skill)", + actual: actual || "(none)", + }; + } + + const pass = actual !== null && allowed.includes(actual); + return { + metric: "skill_match", + pass, + expected: primary ? `${primary}${mayAlsoMatch.length ? ` (or: ${mayAlsoMatch.join(", ")})` : ""}` : "(any)", + actual: actual || "(none)", + }; +} + +function evalHasInteractionQuestions(_assertion, state) { + const messages = Array.isArray(state.messages) ? state.messages : []; + const hasQuestions = messages.some((msg) => { + if (msg.type !== "ai" && msg.role !== "assistant") return false; + if (Array.isArray(msg.tool_calls)) { + return msg.tool_calls.some( + (tc) => tc.name === "ask_user_clarification", + ); + } + return false; + }); + return { + metric: "has_interaction_questions", + pass: hasQuestions, + expected: "agent asks user for missing parameters", + actual: hasQuestions ? "questions found" : "no questions asked", + }; +} + +async function evalNaturalLanguage(assertion, state) { + const result = await evaluateNaturalLanguage(assertion.description, state); + const suffix = result.reason ? ` — ${result.reason}` : ""; + return { + metric: "natural_language", + pass: result.pass, + expected: assertion.description, + actual: result.pass ? "satisfied" : `not satisfied${suffix}`, + }; +} + +// --------------------------------------------------------------------------- +// Main evaluator +// --------------------------------------------------------------------------- + +/** + * Dispatch a single assertion to its typed evaluator. 
+ * + * @param {object} assertion - v2 assertion object + * @param {object} state - AgentState + * @returns {Promise<{ metric: string, pass: boolean, expected: string, actual: string }>} + */ +async function dispatchAssertion(assertion, state) { + switch (assertion.type) { + case "structural_type": + return evalStructuralType(assertion, state); + case "has_model": + return evalHasModel(assertion, state); + case "has_analysis": + return evalHasAnalysis(assertion, state); + case "has_report": + return evalHasReport(assertion, state); + case "skill_match": + return evalSkillMatch(assertion, state); + case "has_interaction_questions": + return evalHasInteractionQuestions(assertion, state); + case "natural_language": + return evalNaturalLanguage(assertion, state); + default: + return { + metric: `unknown:${assertion.type || "undefined"}`, + pass: false, + expected: "valid assertion type (structural_type, has_model, has_analysis, has_report, skill_match, has_interaction_questions, natural_language)", + actual: `unsupported type: ${assertion.type || "(undefined)"}`, + }; + } +} + +/** + * Evaluate a completed AgentState against a scenario's expectations. + * + * @param {object} scenario - benchmark scenario (v1 or v2 format) + * @param {object} state - AgentState returned by service.runFull + * @param {number} durationMs - elapsed time in milliseconds + * @returns {Promise} evaluation result + */ +async function evaluateScenario(scenario, state, durationMs) { + const metrics = []; + const { assertions } = upgradeExpect(scenario.expect || {}); + + for (const assertion of assertions) { + try { + metrics.push(await dispatchAssertion(assertion, state)); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + metrics.push({ + metric: assertion.type || "unknown", + pass: false, + expected: "(assertion ran without error)", + actual: `error: ${msg}`, + }); + } } - // Tool call count (informational) + // Tool call count (informational — always measured) let toolCallCount = 0; const messages = Array.isArray(state.messages) ? state.messages : []; for (const msg of messages) { diff --git a/tests/llm-benchmark/lib/judge.cjs b/tests/llm-benchmark/lib/judge.cjs new file mode 100644 index 00000000..934fa2d6 --- /dev/null +++ b/tests/llm-benchmark/lib/judge.cjs @@ -0,0 +1,234 @@ +/** + * LLM-as-Judge evaluator for natural_language assertions. + * + * Configuration (via environment variables): + * LLM_JUDGE_MODEL — model to use (falls back to LLM_MODEL, then "gpt-4o-mini") + * LLM_JUDGE_API_KEY — API key (falls back to LLM_API_KEY) + * LLM_JUDGE_BASE_URL — base URL (falls back to LLM_BASE_URL, then "https://api.openai.com") + * + * Fixed parameters: temperature=0, max_tokens=500, timeout=30s + */ + +const https = require("node:https"); + +const JUDGE_TEMPERATURE = 0; +const JUDGE_MAX_TOKENS = 500; +const JUDGE_TIMEOUT_MS = 30_000; +const MAX_RESPONSE_BODY = 100_000; // 100KB + +/** + * Build a compact summary of the agent output for the judge prompt. + * @param {object} state - AgentState returned by runFull + * @returns {string} + */ +function summarizeAgentOutput(state) { + const parts = []; + + if (state.structuralTypeKey) { + parts.push(`Structural type: ${state.structuralTypeKey}`); + } + + if (state.model) { + const nodes = Array.isArray(state.model.nodes) ? state.model.nodes.length : 0; + const elements = Array.isArray(state.model.elements) ? 
state.model.elements.length : 0; + parts.push(`Model: ${nodes} nodes, ${elements} elements`); + } + + if (state.analysisResult) { + const keys = Object.keys(state.analysisResult).filter((k) => k !== "_raw").join(", "); + parts.push(`Analysis result keys: ${keys || "(present)"}`); + const displacements = + state.analysisResult.displacements || state.analysisResult.nodeDisplacements; + if (Array.isArray(displacements) && displacements.length > 0) { + parts.push(`Sample displacement: ${JSON.stringify(displacements[0])}`); + } + const reactions = state.analysisResult.reactions || state.analysisResult.nodeReactions; + if (Array.isArray(reactions) && reactions.length > 0) { + parts.push(`Sample reaction: ${JSON.stringify(reactions[0])}`); + } + } + + if (state.report?.markdown) { + parts.push(`Report excerpt: ${state.report.markdown.slice(0, 500)}`); + } + + return parts.length > 0 ? parts.join("\n") : "(no agent output)"; +} + +/** + * Build the judge prompt. + * @param {string} description - natural language criterion + * @param {string} agentOutput - compact summary of agent state + * @returns {string} + */ +function buildJudgePrompt(description, agentOutput) { + return [ + "You are a structural engineering test evaluator.", + "Based on the agent output below, judge whether the following criterion is satisfied.", + "", + `Criterion: ${description}`, + "", + "Agent output:", + agentOutput, + "", + 'Respond ONLY with a JSON object on a single line: {"pass": true} or {"pass": false, "reason": "brief explanation"}', + "Do not include any other text.", + ].join("\n"); +} + +/** + * Extract JSON from LLM response, handling markdown fences and nested braces. 
+ * @param {string} response - raw LLM response + * @returns {object|null} parsed object or null + */ +function extractJudgeJson(response) { + let text = response.trim(); + + // Strip markdown code fences + const fenceMatch = text.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/); + if (fenceMatch) { + text = fenceMatch[1].trim(); + } + + // Try full parse first + try { + return JSON.parse(text); + } catch { + // Fall through to brace-matching + } + + // Find balanced braces — greedy match from first { to last } + const start = text.indexOf("{"); + const end = text.lastIndexOf("}"); + if (start !== -1 && end > start) { + try { + return JSON.parse(text.slice(start, end + 1)); + } catch { + return null; + } + } + return null; +} + +/** + * Call the LLM judge API (HTTPS only). + * @param {string} prompt + * @returns {Promise} raw response text + */ +function callLlmJudgeApi(prompt) { + const apiKey = process.env.LLM_JUDGE_API_KEY || process.env.LLM_API_KEY; + if (!apiKey) { + throw new Error("LLM_JUDGE_API_KEY or LLM_API_KEY is required for judge evaluation"); + } + const model = process.env.LLM_JUDGE_MODEL || process.env.LLM_MODEL || "gpt-4o-mini"; + const rawBase = + process.env.LLM_JUDGE_BASE_URL || process.env.LLM_BASE_URL || "https://api.openai.com"; + let base = rawBase.endsWith("/") ? rawBase.slice(0, -1) : rawBase; + + // Build URL handling bases that already include /v1 or other versioned paths + const chatPath = /\/v\d+$/.test(base) ? 
"/chat/completions" : "/v1/chat/completions"; + const url = new URL(`${base}${chatPath}`); + if (url.protocol !== "https:") { + throw new Error(`Judge API must use HTTPS, got: ${url.protocol}`); + } + + const bodyStr = JSON.stringify({ + model, + temperature: JUDGE_TEMPERATURE, + max_tokens: JUDGE_MAX_TOKENS, + messages: [{ role: "user", content: prompt }], + }); + + return new Promise((resolve, reject) => { + let settled = false; + + const req = https.request( + { + hostname: url.hostname, + port: url.port || 443, + path: url.pathname + url.search, + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + "Content-Length": Buffer.byteLength(bodyStr), + }, + }, + (res) => { + let data = ""; + res.on("data", (chunk) => { + data += chunk; + if (data.length > MAX_RESPONSE_BODY) { + if (!settled) { + settled = true; + req.destroy(new Error("Judge response body exceeded 100KB limit")); + } + } + }); + res.on("end", () => { + if (settled) return; + if (res.statusCode && res.statusCode >= 400) { + settled = true; + reject( + new Error(`Judge API returned HTTP ${res.statusCode}`), + ); + return; + } + try { + const parsed = JSON.parse(data); + const content = parsed.choices?.[0]?.message?.content ?? ""; + settled = true; + resolve(content.trim()); + } catch { + settled = true; + reject(new Error(`Failed to parse judge response: ${data.slice(0, 100)}`)); + } + }); + }, + ); + + req.setTimeout(JUDGE_TIMEOUT_MS, () => { + if (!settled) { + settled = true; + req.destroy(new Error("LLM judge request timed out after 30s")); + } + }); + req.on("error", (err) => { + if (!settled) { + settled = true; + reject(err); + } + }); + req.write(bodyStr); + req.end(); + }); +} + +/** + * Evaluate a natural_language assertion against the agent state using LLM-as-Judge. 
+ * + * @param {string} description - the natural language criterion to evaluate + * @param {object} state - AgentState returned by runFull + * @returns {Promise<{ pass: boolean, reason?: string }>} + */ +async function evaluateNaturalLanguage(description, state) { + const agentOutput = summarizeAgentOutput(state); + const prompt = buildJudgePrompt(description, agentOutput); + + try { + const response = await callLlmJudgeApi(prompt); + const result = extractJudgeJson(response); + if (!result) { + return { pass: false, reason: `Judge returned non-JSON: ${response.slice(0, 100)}` }; + } + return { + pass: result.pass === true, + reason: typeof result.reason === "string" ? result.reason : undefined, + }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + return { pass: false, reason: `Judge error: ${msg}` }; + } +} + +module.exports = { evaluateNaturalLanguage }; diff --git a/tests/llm-benchmark/lib/report.cjs b/tests/llm-benchmark/lib/report.cjs index 1cc5dc02..46754949 100644 --- a/tests/llm-benchmark/lib/report.cjs +++ b/tests/llm-benchmark/lib/report.cjs @@ -3,6 +3,11 @@ function formatMetric(m) { return ` ${icon} ${m.metric.padEnd(16)} expected: ${m.expected.padEnd(30)} actual: ${m.actual}`; } +function formatTurnMetric(m) { + const icon = m.pass ? "\u2713" : "\u2717"; + return ` ${icon} ${m.metric.padEnd(14)} ${m.actual}`; +} + function printScenarioResult(scenario, evaluation) { const status = evaluation.allPassed ? 
"PASS" : "FAIL"; process.stdout.write(`\n${"=".repeat(60)}\n`); @@ -10,9 +15,30 @@ function printScenarioResult(scenario, evaluation) { if (scenario.description) { process.stdout.write(` ${scenario.description}\n`); } - process.stdout.write(`\n`); - for (const m of evaluation.metrics) { - process.stdout.write(formatMetric(m) + "\n"); + + if (evaluation.turnResults && Array.isArray(evaluation.turnResults)) { + const turns = scenario.turns || []; + for (const { turnIndex, evaluation: turnEval } of evaluation.turnResults) { + const turnMsg = turns[turnIndex]?.message || "(turn)"; + const preview = turnMsg.length > 40 ? turnMsg.slice(0, 40) + "..." : turnMsg; + process.stdout.write(`\n Turn ${turnIndex + 1}: "${preview}"\n`); + for (const m of turnEval.metrics) { + if (m.metric === "duration") continue; + if (m.metric === "toolCalls" && m.pass) continue; + process.stdout.write(formatTurnMetric(m) + "\n"); + } + } + process.stdout.write(`\n`); + for (const m of evaluation.metrics) { + if (m.metric === "duration") { + process.stdout.write(formatMetric(m) + "\n"); + } + } + } else { + process.stdout.write(`\n`); + for (const m of evaluation.metrics) { + process.stdout.write(formatMetric(m) + "\n"); + } } process.stdout.write(`${"=".repeat(60)}\n`); } diff --git a/tests/llm-benchmark/lib/skill-trace.cjs b/tests/llm-benchmark/lib/skill-trace.cjs new file mode 100644 index 00000000..e62cc420 --- /dev/null +++ b/tests/llm-benchmark/lib/skill-trace.cjs @@ -0,0 +1,39 @@ +/** + * Extract skill match results from Agent messages by parsing + * detect_structure_type tool result messages. + * + * LangGraph stores tool results as ToolMessage objects in state.messages. + * These have msg.name === 'detect_structure_type' and msg.content as JSON. 
+ * + * @param {unknown[]} messages - state.messages from AgentState + * @returns {{ skillId: string|null, structureType: string|null, mappedType: string|null } | null} + */ +function extractSkillTrace(messages) { + if (!Array.isArray(messages)) return null; + + // Scan from end to find the most recent routing decision (important for multi-turn) + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if (!msg || typeof msg !== 'object') continue; + + // LangChain ToolMessage: msg.name === tool name + if (msg.name !== 'detect_structure_type') continue; + + try { + const content = typeof msg.content === 'string' ? JSON.parse(msg.content) : msg.content; + if (content && typeof content === 'object') { + return { + skillId: content.skillId || null, + structureType: content.key || null, + mappedType: content.mappedType || null, + }; + } + } catch { + // ignore JSON parse errors; continue to next message + } + } + + return null; +} + +module.exports = { extractSkillTrace }; diff --git a/tests/llm-benchmark/runner.cjs b/tests/llm-benchmark/runner.cjs index 80aea53a..dfecf95b 100644 --- a/tests/llm-benchmark/runner.cjs +++ b/tests/llm-benchmark/runner.cjs @@ -2,7 +2,6 @@ const path = require("node:path"); const fs = require("node:fs"); const { pathToFileURL } = require("node:url"); -const { resolveIntegrationContext } = require("../llm-integration/lib/context.js"); const { evaluateScenario } = require("./lib/evaluate.cjs"); const { printScenarioResult, printSummary, writeJsonOutput } = require("./lib/report.cjs"); @@ -46,9 +45,63 @@ function parseBenchmarkOptions(args) { return { scenarioId, outputPath }; } +function normalizeScenario(scenario) { + if (scenario.turns) { + return { ...scenario, _multiTurn: true }; + } + return { + ...scenario, + _multiTurn: false, + turns: [ + { + message: scenario.message, + assertions: scenario.expect?.assertions, + }, + ], + }; +} + +function mergeTurnResults(scenario, turnResults, totalDurationMs) { + const 
allMetrics = []; + let allPassed = true; + let passed = 0; + let total = 0; + + for (const { evaluation } of turnResults) { + // Keep per-turn toolCalls and duration as informational + const coreMetrics = evaluation.metrics.filter( + (m) => m.metric !== "toolCalls" && m.metric !== "duration", + ); + allMetrics.push(...coreMetrics); + passed += coreMetrics.filter((m) => m.pass).length; + total += coreMetrics.length; + if (!evaluation.allPassed) allPassed = false; + } + + // Add overall informational metrics + allMetrics.push({ + metric: "duration", + pass: true, + expected: "(info)", + actual: `${(totalDurationMs / 1000).toFixed(1)}s`, + }); + + return { + scenarioId: scenario.id, + description: scenario.description || "", + passed, + total: total + 1, + allPassed, + metrics: allMetrics, + durationMs: totalDurationMs, + turnResults: turnResults.map((r) => ({ turnIndex: r.turnIndex, evaluation: r.evaluation })), + }; +} + async function runBenchmark(rootDir, args) { const options = parseBenchmarkOptions(args); - const context = resolveIntegrationContext(rootDir); + const { resolveRegressionContext } = require("../regression/shared.js"); + const context = resolveRegressionContext(rootDir); // Inject LLM env vars for (const [k, v] of Object.entries(context.env)) { @@ -89,36 +142,100 @@ async function runBenchmark(rootDir, args) { const results = []; - for (const scenario of scenarios) { - process.stdout.write(`\nRunning: ${scenario.id}...\n`); - const startTime = Date.now(); + for (const rawScenario of scenarios) { + const scenario = normalizeScenario(rawScenario); + const maxRetries = Math.max(0, typeof scenario.maxRetries === "number" ? 
scenario.maxRetries : 0); + let attempt = 0; + let lastEvaluation = null; + + while (attempt <= maxRetries) { + if (attempt > 0) { + process.stdout.write(` Retrying (attempt ${attempt + 1}/${maxRetries + 1})...\n`); + } else { + process.stdout.write(`\nRunning: ${scenario.id}...\n`); + } + + const scenarioStart = Date.now(); + let executionError = false; + const turnResults = []; + const conversationId = `bench-${scenario.id}-${scenarioStart}-${attempt}`; + + // Suppress noisy agent logs during execution + const prevLogLevel = process.env.LOG_LEVEL; + process.env.LOG_LEVEL = "warn"; + + try { + for (let i = 0; i < scenario.turns.length; i++) { + const turn = scenario.turns[i]; + const turnStart = Date.now(); + + const state = await service.runFull({ + message: turn.message, + conversationId, + context: { locale: scenario.locale || "zh" }, + }); + + const turnDurationMs = Date.now() - turnStart; + + if (turn.assertions && turn.assertions.length > 0) { + const turnScenario = { ...scenario, expect: { assertions: turn.assertions } }; + const evaluation = await evaluateScenario(turnScenario, state, turnDurationMs); + turnResults.push({ turnIndex: i, evaluation }); + } + } + } catch (err) { + executionError = true; + const message = err instanceof Error ? 
err.message : String(err); + process.stdout.write(` error: ${message}\n`); + turnResults.push({ + turnIndex: scenario.turns.length - 1, + evaluation: { + scenarioId: scenario.id, + description: scenario.description || "", + passed: 0, + total: 1, + allPassed: false, + metrics: [{ metric: "execution", pass: false, expected: "no error", actual: message }], + durationMs: Date.now() - scenarioStart, + }, + }); + } finally { + if (prevLogLevel === undefined) { + delete process.env.LOG_LEVEL; + } else { + process.env.LOG_LEVEL = prevLogLevel; + } + } + + const totalDurationMs = Date.now() - scenarioStart; + + if (turnResults.length > 0) { + if (scenario._multiTurn) { + lastEvaluation = mergeTurnResults(scenario, turnResults, totalDurationMs); + } else { + lastEvaluation = turnResults[0].evaluation; + } + } + + if (lastEvaluation && lastEvaluation.allPassed) break; + if (executionError) break; + attempt += 1; + } - try { - const state = await service.runFull({ - message: scenario.message, - conversationId: `bench-${scenario.id}-${startTime}`, - context: { locale: scenario.locale || "zh" }, - }); - - const durationMs = Date.now() - startTime; - const evaluation = evaluateScenario(scenario, state, durationMs); - printScenarioResult(scenario, evaluation); - results.push(evaluation); - } catch (err) { - const durationMs = Date.now() - startTime; - const message = err instanceof Error ? 
err.message : String(err); - process.stdout.write(`\nFAIL ${scenario.id}\n`); - process.stdout.write(` error: ${message}\n`); - results.push({ + if (!lastEvaluation) { + lastEvaluation = { scenarioId: scenario.id, description: scenario.description || "", passed: 0, total: 1, allPassed: false, - metrics: [{ metric: "execution", pass: false, expected: "no error", actual: message }], - durationMs, - }); + metrics: [{ metric: "execution", pass: false, expected: "no evaluation produced", actual: "(none)" }], + durationMs: 0, + }; } + + printScenarioResult(rawScenario, lastEvaluation); + results.push(lastEvaluation); } printSummary(results); diff --git a/tests/llm-benchmark/scenarios/beam.json b/tests/llm-benchmark/scenarios/beam.json index 775bff8d..43fb7b69 100644 --- a/tests/llm-benchmark/scenarios/beam.json +++ b/tests/llm-benchmark/scenarios/beam.json @@ -2,40 +2,84 @@ { "id": "beam-static-6m", "description": "简支梁6米均布荷载静力分析", + "category": "static-analysis", + "tags": ["beam", "static", "zh", "basic"], "message": "简支梁6米,均布荷载20kN/m,请进行静力分析", "locale": "zh", + "maxRetries": 2, "expect": { - "structuralType": "beam", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 2, - "minElements": 1 + "skills": { "primary": "beam", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "beam" }, + { "type": "skill_match", "primary": "beam", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 2, "minElements": 1 }, + { "type": "has_analysis" }, + { "type": "natural_language", "description": "Analysis results should include displacement or deflection values" } + ] } }, { "id": "beam-cantilever-point-load", "description": "悬臂梁端部集中力静力分析", + "category": "static-analysis", + "tags": ["beam", "static", "zh", "cantilever"], "message": "一根悬臂梁,长3米,端部集中力10kN,请进行静力分析", "locale": "zh", + "maxRetries": 2, "expect": { - "structuralType": "beam", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 2, - "minElements": 1 + "skills": { 
"primary": "beam", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "beam" }, + { "type": "skill_match", "primary": "beam", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 2, "minElements": 1 }, + { "type": "has_analysis" }, + { "type": "natural_language", "description": "Analysis results should reflect a cantilever boundary condition (fixed at one end, free at the other)" } + ] } }, { "id": "beam-simply-supported-en", "description": "Simply supported beam with UDL static analysis", + "category": "static-analysis", + "tags": ["beam", "static", "en", "basic"], "message": "A simply supported beam, span 8m, uniformly distributed load 15kN/m, please run a static analysis", "locale": "en", + "maxRetries": 2, "expect": { - "structuralType": "beam", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 2, - "minElements": 1 + "skills": { "primary": "beam", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "beam" }, + { "type": "skill_match", "primary": "beam", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 2, "minElements": 1 }, + { "type": "has_analysis" }, + { "type": "natural_language", "description": "Analysis results should include mid-span deflection or maximum displacement" } + ] } + }, + { + "id": "beam-multi-turn-incomplete", + "description": "简支梁参数不全→多轮补全→分析", + "category": "static-analysis", + "tags": ["beam", "static", "zh", "multi-turn"], + "locale": "zh", + "maxRetries": 1, + "turns": [ + { + "message": "帮我分析一根简支梁", + "assertions": [ + { "type": "structural_type", "expected": "beam" }, + { "type": "skill_match", "primary": "beam", "mayAlsoMatch": ["generic"] }, + { "type": "has_interaction_questions" } + ] + }, + { + "message": "跨度6米,均布荷载20kN/m", + "assertions": [ + { "type": "has_model", "minNodes": 2, "minElements": 1 }, + { "type": "has_analysis" }, + { "type": "natural_language", "description": "Analysis results should include 
displacement values for a 6m beam under 20kN/m UDL" } + ] + } + ] } ] diff --git a/tests/llm-benchmark/scenarios/double-span-beam.json b/tests/llm-benchmark/scenarios/double-span-beam.json index 4b42df38..b7811570 100644 --- a/tests/llm-benchmark/scenarios/double-span-beam.json +++ b/tests/llm-benchmark/scenarios/double-span-beam.json @@ -2,14 +2,19 @@ { "id": "double-span-beam-static", "description": "双跨连续梁静力分析", + "category": "static-analysis", + "tags": ["beam", "continuous", "static", "zh"], "message": "两跨连续梁,跨度5m和6m,均布荷载12kN/m,做静力分析", "locale": "zh", + "maxRetries": 2, "expect": { - "structuralType": "double-span-beam", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 3, - "minElements": 2 + "skills": { "primary": "double-span-beam", "mayAlsoMatch": ["beam", "generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "double-span-beam" }, + { "type": "skill_match", "primary": "double-span-beam", "mayAlsoMatch": ["beam", "generic"] }, + { "type": "has_model", "minNodes": 3, "minElements": 2 }, + { "type": "has_analysis" } + ] } } ] diff --git a/tests/llm-benchmark/scenarios/frame.json b/tests/llm-benchmark/scenarios/frame.json index 10755cbc..e53200dd 100644 --- a/tests/llm-benchmark/scenarios/frame.json +++ b/tests/llm-benchmark/scenarios/frame.json @@ -2,40 +2,57 @@ { "id": "frame-2story-1bay-static", "description": "2层单跨钢框架静力分析", + "category": "static-analysis", + "tags": ["frame", "static", "zh", "multi-story"], "message": "2层单跨钢框架,层高3.6m,跨度6m,每层楼面荷载10kN/m2,请进行静力分析", "locale": "zh", + "maxRetries": 2, "expect": { - "structuralType": "frame", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 6, - "minElements": 6 + "skills": { "primary": "frame", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "frame" }, + { "type": "skill_match", "primary": "frame", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 6, "minElements": 6 }, + { "type": "has_analysis" }, + { "type": 
"natural_language", "description": "Analysis results should include story drift or inter-story displacement values" } + ] } }, { "id": "frame-3story-2bay-static", "description": "3层2跨框架静力分析", + "category": "static-analysis", + "tags": ["frame", "static", "zh", "multi-bay"], "message": "3层2跨框架,层高3.3m,跨度5.4m和6m,每层楼面荷载15kN/m,请进行静力分析", "locale": "zh", + "maxRetries": 2, "expect": { - "structuralType": "frame", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 12, - "minElements": 12 + "skills": { "primary": "frame", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "frame" }, + { "type": "skill_match", "primary": "frame", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 12, "minElements": 12 }, + { "type": "has_analysis" }, + { "type": "natural_language", "description": "Model should represent a 3-story 2-bay frame with correct node and element count" } + ] } }, { "id": "frame-steel-en", "description": "2-story single-bay steel frame static analysis", + "category": "static-analysis", + "tags": ["frame", "static", "en", "multi-story"], "message": "2-story single-bay steel frame, story height 3.6m, bay width 6m, floor load 10kN/m2, please run a static analysis", "locale": "en", + "maxRetries": 2, "expect": { - "structuralType": "frame", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 6, - "minElements": 6 + "skills": { "primary": "frame", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "frame" }, + { "type": "skill_match", "primary": "frame", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 6, "minElements": 6 }, + { "type": "has_analysis" } + ] } } ] diff --git a/tests/llm-benchmark/scenarios/portal-frame.json b/tests/llm-benchmark/scenarios/portal-frame.json index cdce0322..44f6f3ec 100644 --- a/tests/llm-benchmark/scenarios/portal-frame.json +++ b/tests/llm-benchmark/scenarios/portal-frame.json @@ -2,27 +2,37 @@ { "id": 
"portal-frame-static-18m", "description": "门式刚架18米跨度静力分析", + "category": "static-analysis", + "tags": ["portal-frame", "static", "zh"], "message": "门式刚架,跨度18m,高度7m,屋面荷载6kN/m,做静力分析", "locale": "zh", + "maxRetries": 2, "expect": { - "structuralType": "portal-frame", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 3, - "minElements": 3 + "skills": { "primary": "portal-frame", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "portal-frame" }, + { "type": "skill_match", "primary": "portal-frame", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 3, "minElements": 3 }, + { "type": "has_analysis" } + ] } }, { "id": "portal-frame-static-en", "description": "Portal frame 21m span static analysis", + "category": "static-analysis", + "tags": ["portal-frame", "static", "en"], "message": "Portal frame, span 21m, height 7.5m, roof load 8kN/m, please run a static analysis", "locale": "en", + "maxRetries": 2, "expect": { - "structuralType": "portal-frame", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 3, - "minElements": 3 + "skills": { "primary": "portal-frame", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "portal-frame" }, + { "type": "skill_match", "primary": "portal-frame", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 3, "minElements": 3 }, + { "type": "has_analysis" } + ] } } ] diff --git a/tests/llm-benchmark/scenarios/truss.json b/tests/llm-benchmark/scenarios/truss.json index 59ce7b1d..ebf0f042 100644 --- a/tests/llm-benchmark/scenarios/truss.json +++ b/tests/llm-benchmark/scenarios/truss.json @@ -2,14 +2,19 @@ { "id": "truss-triangle-static", "description": "三角桁架静力分析", + "category": "static-analysis", + "tags": ["truss", "static", "zh"], "message": "三角桁架,跨度12m,高3m,节点荷载20kN,做静力分析", "locale": "zh", + "maxRetries": 2, "expect": { - "structuralType": "truss", - "hasModel": true, - "hasAnalysis": true, - "minNodes": 3, - 
"minElements": 3 + "skills": { "primary": "truss", "mayAlsoMatch": ["generic"] }, + "assertions": [ + { "type": "structural_type", "expected": "truss" }, + { "type": "skill_match", "primary": "truss", "mayAlsoMatch": ["generic"] }, + { "type": "has_model", "minNodes": 3, "minElements": 3 }, + { "type": "has_analysis" } + ] } } ] diff --git a/tests/llm-integration/lib/context.js b/tests/llm-integration/lib/context.js deleted file mode 100644 index d80f4a4f..00000000 --- a/tests/llm-integration/lib/context.js +++ /dev/null @@ -1,38 +0,0 @@ -const path = require("node:path"); -const runtime = require("../../../scripts/cli/runtime"); - -/** - * Resolve the integration test context: paths, env vars, and pre-flight checks. - * Throws early if LLM_API_KEY is missing. - */ -function resolveIntegrationContext(rootDir) { - const projectRoot = runtime.resolveProjectRoot(rootDir); - const { paths, env } = runtime.loadProjectEnvironment(projectRoot); - - const llmApiKey = process.env.LLM_API_KEY || env.LLM_API_KEY || ""; - const llmModel = process.env.LLM_MODEL || env.LLM_MODEL || ""; - const llmBaseUrl = process.env.LLM_BASE_URL || env.LLM_BASE_URL || ""; - - if (!llmApiKey) { - throw new Error( - "LLM_API_KEY is required for integration tests.\n" + - "Set it via environment variable or .env file." 
- ); - } - - return { - rootDir: projectRoot, - paths, - env: { - ...env, - LLM_API_KEY: llmApiKey, - LLM_MODEL: llmModel, - LLM_BASE_URL: llmBaseUrl, - DATABASE_URL: `file:${path - .join(projectRoot, ".structureclaw", "data", "structureclaw-llm-test.db") - .replace(/\\/gu, "/")}`, - }, - }; -} - -module.exports = { resolveIntegrationContext }; diff --git a/tests/llm-integration/lib/executors.cjs b/tests/llm-integration/lib/executors.cjs deleted file mode 100644 index 2c34f6d3..00000000 --- a/tests/llm-integration/lib/executors.cjs +++ /dev/null @@ -1,221 +0,0 @@ -const { - assert, - assertToolCalls, - applyCriticalMissingAssertions, -} = require("./assertions.js"); - -function resolveLocale(locale) { - return locale === "zh" ? "zh" : "en"; -} - -function resolveCaseExpect(testCase = {}) { - if (testCase.expect && typeof testCase.expect === "object") { - return testCase.expect; - } - if (testCase.assertions && typeof testCase.assertions === "object") { - return testCase.assertions; - } - return {}; -} - -function shouldEnableAutoCodeCheck(expected = {}) { - if (typeof expected.autoCodeCheck === "boolean") { - return expected.autoCodeCheck; - } - - return Array.isArray(expected.toolCalls) && expected.toolCalls.includes("run_code_check"); -} - -function attachExecutionResult(error, key, result) { - if (error && typeof error === "object" && !Object.prototype.hasOwnProperty.call(error, key)) { - error[key] = result; - } - return error; -} - -async function runRoutingTest(runtime, testCase) { - const locale = resolveLocale(testCase.locale); - const message = testCase.messages[0]; - const match = await runtime.detectStructuralType(message, locale, undefined, testCase.enabledSkillIds); - const expected = resolveCaseExpect(testCase); - - if (expected.inferredType) { - const actualKey = match.mappedType || match.key; - assert( - actualKey === expected.inferredType || match.skillId === expected.inferredType, - `expected inferredType="${expected.inferredType}", got 
key="${match.key}" mappedType="${match.mappedType}" skillId="${match.skillId}"` - ); - } - if (expected.structuralTypeKey) { - assert( - match.key === expected.structuralTypeKey || match.mappedType === expected.structuralTypeKey, - `expected structuralTypeKey="${expected.structuralTypeKey}", got key="${match.key}" mappedType="${match.mappedType}"` - ); - } -} - -async function runExtractionTest(runtime, llm, testCase) { - const locale = resolveLocale(testCase.locale); - const message = testCase.messages[0]; - const expected = resolveCaseExpect(testCase); - - const result = await runtime.textToModelDraft(llm, message, undefined, locale, testCase.enabledSkillIds); - - if (expected.inferredType) { - assert( - result.inferredType === expected.inferredType, - `expected inferredType="${expected.inferredType}", got "${result.inferredType}"` - ); - } - - if ( - expected.criticalMissing !== undefined - || expected.criticalMissingIncludes - || expected.criticalMissingNotIncludes - ) { - applyCriticalMissingAssertions(result.missingFields || [], expected); - } - - if (expected.draftPatch) { - try { - assertDraftPatch(result.stateToPersist || {}, expected.draftPatch); - } catch (error) { - throw attachExecutionResult(error, "draftResult", result); - } - } - - return result; -} - -function assertDraftPatch(state, expectedPatch) { - for (const [key, expectedValue] of Object.entries(expectedPatch)) { - const actualValue = state[key]; - if (expectedValue === null || expectedValue === undefined) continue; - - if (typeof expectedValue === "object" && expectedValue.value !== undefined) { - const tolerance = expectedValue.tolerance || 0.05; - const expected = expectedValue.value; - - if (Array.isArray(expected)) { - assert(Array.isArray(actualValue), `expected ${key} to be an array, got ${typeof actualValue}: ${actualValue}`); - assert(actualValue.length === expected.length, `expected ${key} length ${expected.length}, got ${actualValue.length}`); - for (let i = 0; i < expected.length; 
i++) { - const diff = Math.abs(actualValue[i] - expected[i]) / Math.abs(expected[i] || 1); - assert(diff <= tolerance, `expected ${key}[${i}]=${expected[i]} (±${(tolerance * 100).toFixed(0)}%), got ${actualValue[i]}`); - } - } else { - assert(typeof actualValue === "number", `expected ${key} to be a number, got ${typeof actualValue}: ${actualValue}`); - const diff = Math.abs(actualValue - expected) / Math.abs(expected || 1); - assert(diff <= tolerance, `expected ${key}=${expected} (±${(tolerance * 100).toFixed(0)}%), got ${actualValue}`); - } - } else { - assert( - actualValue === expectedValue, - `expected ${key}="${expectedValue}", got "${actualValue}"` - ); - } - } -} - -async function runPipelineTest(agentService, testCase) { - const locale = resolveLocale(testCase.locale); - const message = testCase.messages[0]; - const expected = resolveCaseExpect(testCase); - - const result = await agentService.run({ - message, - conversationId: `llm-test-${testCase.id}-${Date.now()}`, - traceId: `trace-${testCase.id}`, - context: { - locale, - skillIds: testCase.enabledSkillIds, - autoAnalyze: true, - includeReport: expected.expectReport !== false, - autoCodeCheck: shouldEnableAutoCodeCheck(expected), - }, - }); - - try { - if (typeof expected.success === "boolean") { - assert( - Boolean(result.success) === expected.success, - `expected pipeline success=${expected.success}, got ${Boolean(result.success)}` - ); - } - - if (expected.toolCalls) { - assertToolCalls(result.toolCalls || [], expected.toolCalls); - } - - const analysisCall = result.toolCalls?.find((tc) => tc.tool === "run_analysis"); - if (expected.analysisSuccess === true) { - assert( - analysisCall, - "expected run_analysis to execute, but no run_analysis tool call was recorded" - ); - } - - if (expected.analysisSuccess !== false && result.toolCalls) { - if (analysisCall) { - assert( - analysisCall.status === "success", - `run_analysis should succeed, got status="${analysisCall.status}"${analysisCall.error ? 
`, error: ${analysisCall.error}` : ""}` - ); - } - } - } catch (error) { - throw attachExecutionResult(error, "pipelineResult", result); - } - - return result; -} - -async function runClarificationTest(runtime, llm, testCase) { - const locale = resolveLocale(testCase.locale); - let currentState = undefined; - let lastResult = null; - - for (let i = 0; i < testCase.turns.length; i++) { - const turn = testCase.turns[i]; - const result = await runtime.textToModelDraft(llm, turn.message, currentState, locale, testCase.enabledSkillIds); - currentState = result.stateToPersist; - lastResult = result; - - const expected = turn.assertions; - - if ( - expected.criticalMissing !== undefined - || expected.criticalMissingIncludes - || expected.criticalMissingNotIncludes - ) { - applyCriticalMissingAssertions(result.missingFields || [], expected); - } - if (expected.modelBuilt !== undefined) { - if (expected.modelBuilt) { - assert(result.model !== undefined, `expected model to be built on turn ${i + 1}, but it was undefined`); - } else { - assert(result.model === undefined, `expected model NOT to be built on turn ${i + 1}`); - } - } - if (expected.draftPatch) { - try { - assertDraftPatch(result.stateToPersist || {}, expected.draftPatch); - } catch (error) { - throw attachExecutionResult(error, "draftResult", result); - } - } - } - - return lastResult; -} - -module.exports = { - resolveCaseExpect, - shouldEnableAutoCodeCheck, - attachExecutionResult, - runRoutingTest, - runExtractionTest, - runPipelineTest, - runClarificationTest, - assertDraftPatch, -}; diff --git a/tests/llm-integration/lib/executors.test.cjs b/tests/llm-integration/lib/executors.test.cjs deleted file mode 100644 index 725c205d..00000000 --- a/tests/llm-integration/lib/executors.test.cjs +++ /dev/null @@ -1,233 +0,0 @@ -const test = require("node:test"); -const nodeAssert = require("node:assert/strict"); - -const { - resolveCaseExpect, - runRoutingTest, - runExtractionTest, - runPipelineTest, -} = 
require("./executors.cjs"); - -test("resolveCaseExpect prefers v2 expect blocks", () => { - const expected = resolveCaseExpect({ - expect: { inferredType: "frame" }, - assertions: { inferredType: "beam" }, - }); - - nodeAssert.deepEqual(expected, { inferredType: "frame" }); -}); - -test("runRoutingTest forwards enabledSkillIds and uses normalized expect", async () => { - const calls = []; - const runtime = { - async detectStructuralType(message, locale, currentState, skillIds) { - calls.push({ message, locale, currentState, skillIds }); - return { key: "frame", mappedType: "frame", skillId: "frame" }; - }, - }; - - await runRoutingTest(runtime, { - locale: "en", - messages: ["3-story steel frame"], - enabledSkillIds: ["frame"], - expect: { - inferredType: "frame", - structuralTypeKey: "frame", - }, - assertions: { - inferredType: "beam", - }, - }); - - nodeAssert.deepEqual(calls, [ - { - message: "3-story steel frame", - locale: "en", - currentState: undefined, - skillIds: ["frame"], - }, - ]); -}); - -test("runExtractionTest uses normalized expect blocks", async () => { - const runtime = { - async textToModelDraft(_llm, message, currentState, locale, skillIds) { - nodeAssert.equal(message, "3-story steel frame"); - nodeAssert.equal(currentState, undefined); - nodeAssert.equal(locale, "en"); - nodeAssert.deepEqual(skillIds, ["frame"]); - return { - inferredType: "frame", - missingFields: [], - stateToPersist: { storyCount: 3 }, - }; - }, - }; - - const result = await runExtractionTest(runtime, {}, { - locale: "en", - messages: ["3-story steel frame"], - enabledSkillIds: ["frame"], - expect: { - inferredType: "frame", - criticalMissing: [], - draftPatch: { storyCount: 3 }, - }, - assertions: { - inferredType: "beam", - }, - }); - - nodeAssert.equal(result.inferredType, "frame"); -}); - -test("runPipelineTest derives context from normalized expect blocks", async () => { - const calls = []; - const agentService = { - async run(input) { - calls.push(input); - return { 
- toolCalls: [ - { tool: "build_model", status: "success" }, - { tool: "run_analysis", status: "success" }, - ], - }; - }, - }; - - const result = await runPipelineTest(agentService, { - id: "frame-static-basic#specific", - locale: "en", - messages: ["3-story steel frame"], - enabledSkillIds: ["frame", "opensees-static"], - expect: { - toolCalls: ["build_model", "run_analysis"], - expectReport: false, - }, - assertions: { - expectReport: true, - }, - }); - - nodeAssert.equal(result.toolCalls.length, 2); - nodeAssert.equal(calls.length, 1); - nodeAssert.equal(calls[0].context.includeReport, false); - nodeAssert.deepEqual(calls[0].context.skillIds, ["frame", "opensees-static"]); - nodeAssert.equal(calls[0].context.autoCodeCheck, false); -}); - -test("runPipelineTest enables code check when the fixture expects run_code_check", async () => { - const calls = []; - const agentService = { - async run(input) { - calls.push(input); - return { - toolCalls: [ - { tool: "build_model", status: "success" }, - { tool: "run_analysis", status: "success" }, - { tool: "run_code_check", status: "success" }, - ], - }; - }, - }; - - await runPipelineTest(agentService, { - id: "frame-static-basic#specific", - locale: "en", - messages: ["2-story single-bay steel frame"], - enabledSkillIds: ["frame", "opensees-static"], - expect: { - toolCalls: ["build_model", "run_analysis", "run_code_check"], - }, - }); - - nodeAssert.equal(calls.length, 1); - nodeAssert.equal(calls[0].context.autoCodeCheck, true); -}); - -test("runPipelineTest attaches pipeline results to assertion failures", async () => { - const agentService = { - async run() { - return { - toolCalls: [ - { tool: "build_model", status: "success" }, - { tool: "run_analysis", status: "success" }, - ], - }; - }, - }; - - let error; - try { - await runPipelineTest(agentService, { - id: "frame-static-basic#specific", - locale: "en", - messages: ["2-story single-bay steel frame"], - enabledSkillIds: ["frame", "opensees-static", 
"code-check-gb50017"], - expect: { - toolCalls: ["build_model", "run_analysis", "run_code_check"], - }, - }); - } catch (err) { - error = err; - } - - nodeAssert.ok(error.pipelineResult); - nodeAssert.deepEqual( - error.pipelineResult.toolCalls.map((call) => call.tool), - ["build_model", "run_analysis"], - ); -}); - -test("runPipelineTest asserts explicit pipeline success flags", async () => { - const agentService = { - async run() { - return { - success: false, - toolCalls: [ - { tool: "build_model", status: "success" }, - { tool: "run_analysis", status: "success" }, - ], - }; - }, - }; - - await nodeAssert.rejects( - () => runPipelineTest(agentService, { - id: "truss-static-basic#specific", - locale: "zh", - messages: ["三角桁架,跨度12m,高3m,节点荷载20kN,做静力分析"], - enabledSkillIds: ["truss", "opensees-static"], - expect: { - success: true, - toolCalls: ["build_model", "run_analysis"], - }, - }), - /expected pipeline success=true, got false/, - ); -}); - -test("runPipelineTest requires run_analysis when analysisSuccess is true", async () => { - const agentService = { - async run() { - return { - success: true, - toolCalls: [ - { tool: "build_model", status: "success" }, - ], - }; - }, - }; - - await nodeAssert.rejects( - () => runPipelineTest(agentService, { - id: "frame-pipeline-multi-bay-zh#legacy", - locale: "zh", - messages: ["3层2跨框架,层高3.3m,跨度5.4m和6m,每层楼面荷载15kN/m"], - expect: { - analysisSuccess: true, - }, - }), - /expected run_analysis to execute, but no run_analysis tool call was recorded/, - ); -}); diff --git a/tests/llm-integration/lib/real-llm-client.cjs b/tests/llm-integration/lib/real-llm-client.cjs deleted file mode 100644 index f8fa76c0..00000000 --- a/tests/llm-integration/lib/real-llm-client.cjs +++ /dev/null @@ -1,126 +0,0 @@ -const { createRequire } = require("node:module"); -const fs = require("node:fs"); -const path = require("node:path"); - -/** - * Create a real LLM client using the backend's @langchain/openai dependency. 
- * Uses `apiKey` (v1.x) parameter name. Reads config from process.env which - * must be set before calling this function. - * - * The returned client is wrapped with LLM call logging so that every - * invoke() call is recorded to .structureclaw/logs/llm-calls-test.jsonl. - * - * @param {object} context - Integration context with env vars - * @param {number} [temperature=0] - LLM temperature (0 for deterministic) - * @returns {import('@langchain/openai').ChatOpenAI | null} - */ -function createRealLlmClient(context, temperature = 0) { - const apiKey = context.env.LLM_API_KEY || process.env.LLM_API_KEY || ""; - if (!apiKey) { - return null; - } - - const backendRequire = createRequire( - path.join(context.rootDir, "backend", "package.json") - ); - const { ChatOpenAI } = backendRequire("@langchain/openai"); - - const model = new ChatOpenAI({ - model: context.env.LLM_MODEL || process.env.LLM_MODEL || undefined, - temperature, - timeout: parseInt(context.env.LLM_TIMEOUT_MS || process.env.LLM_TIMEOUT_MS || "90000", 10), - maxRetries: parseInt(context.env.LLM_MAX_RETRIES || process.env.LLM_MAX_RETRIES || "1", 10), - apiKey, - configuration: { - baseURL: context.env.LLM_BASE_URL || process.env.LLM_BASE_URL || undefined, - }, - }); - - return wrapWithLogging(model, context); -} - -/** - * Self-contained LLM call logger. Writes one JSON line per invoke() call - * to /.structureclaw/logs/llm-calls-test.jsonl — same format as the backend's - * LlmCallLogger so the CI artifact upload picks it up automatically. 
- */ -let _logStream = null; -let _logDisabled = false; -function ensureLogStream(rootDir) { - if (_logDisabled) return null; - if (_logStream) return _logStream; - if (process.env.LLM_LOG_ENABLED === "false") { _logDisabled = true; return null; } - try { - const dir = process.env.LLM_LOG_DIR || path.join(rootDir, ".structureclaw", "logs"); - fs.mkdirSync(dir, { recursive: true }); - _logStream = fs.createWriteStream(path.join(dir, "llm-calls-test.jsonl"), { flags: "a" }); - _logStream.on("error", () => { _logDisabled = true; _logStream = null; }); - return _logStream; - } catch { - _logDisabled = true; - return null; - } -} - -function wrapWithLogging(model, context) { - const stream = ensureLogStream(context.rootDir); - if (!stream) return model; - - const modelName = context.env.LLM_MODEL || process.env.LLM_MODEL || "unknown"; - const originalInvoke = model.invoke.bind(model); - - function safeStringify(val) { - if (typeof val === "string") return val; - try { - const result = JSON.stringify(val); - return result === undefined ? String(val) : result; - } catch { - return String(val); - } - } - - function writeLogEntry(entry) { - try { - stream.write(JSON.stringify(entry) + "\n"); - } catch { - // Non-blocking: never crash on log write failure. 
- } - } - - model.invoke = async function (input, options) { - const promptStr = safeStringify(input); - const start = Date.now(); - try { - const result = await originalInvoke(input, options); - const content = safeStringify(result.content); - writeLogEntry({ - timestamp: new Date().toISOString(), - model: modelName, - prompt: promptStr, - response: content, - promptChars: promptStr.length, - responseChars: content.length, - durationMs: Date.now() - start, - success: true, - }); - return result; - } catch (error) { - writeLogEntry({ - timestamp: new Date().toISOString(), - model: modelName, - prompt: promptStr, - response: null, - promptChars: promptStr.length, - responseChars: 0, - durationMs: Date.now() - start, - success: false, - error: String(error), - }); - throw error; - } - }; - - return model; -} - -module.exports = { createRealLlmClient }; diff --git a/tests/llm-integration/lib/reporting.cjs b/tests/llm-integration/lib/reporting.cjs deleted file mode 100644 index 492fa1cf..00000000 --- a/tests/llm-integration/lib/reporting.cjs +++ /dev/null @@ -1,47 +0,0 @@ -const fs = require("node:fs"); -const path = require("node:path"); - -/** - * Format a single test case summary for console output. - */ -function formatCaseSummary(testCase, observedTrace, status) { - const tools = (observedTrace.toolCalls || []) - .filter((call) => call.status === "success") - .map((call) => call.tool) - .join(" -> ") || "(none)"; - - return [ - ` ${status} ${testCase.id} [${testCase.category}/${testCase.variant}]`, - ` enabled: ${(observedTrace.enabledSkillIds || []).join(", ") || "(auto)"}`, - ` activated: ${(observedTrace.activatedSkillIds || []).join(", ") || "(none)"}`, - ` structural: ${observedTrace.structuralSkillId || "(none)"}`, - ` analysis: ${observedTrace.analysisSkillId || "(none)"}`, - ` tools: ${tools}` - ].join("\n"); -} - -/** - * Append a record to a JSON artifact file. - * Creates the file with an array if it doesn't exist. 
- */ -function appendArtifactRecord(outputPath, record) { - const dir = path.dirname(outputPath); - if (!fs.existsSync(dir)) { - fs.mkdirSync(dir, { recursive: true }); - } - - let records = []; - if (fs.existsSync(outputPath)) { - try { - const parsed = JSON.parse(fs.readFileSync(outputPath, "utf-8")); - records = Array.isArray(parsed) ? parsed : []; - } catch (_) { - records = []; - } - } - - records.push(record); - fs.writeFileSync(outputPath, JSON.stringify(records, null, 2) + "\n"); -} - -module.exports = { formatCaseSummary, appendArtifactRecord }; diff --git a/tests/llm-integration/lib/reporting.test.cjs b/tests/llm-integration/lib/reporting.test.cjs deleted file mode 100644 index 875f0e4e..00000000 --- a/tests/llm-integration/lib/reporting.test.cjs +++ /dev/null @@ -1,65 +0,0 @@ -const test = require("node:test"); -const nodeAssert = require("node:assert/strict"); -const fs = require("node:fs"); -const path = require("node:path"); -const os = require("node:os"); - -const { formatCaseSummary, appendArtifactRecord } = require("./reporting.cjs"); - -test("formatCaseSummary produces expected multi-line output", () => { - const summary = formatCaseSummary( - { id: "frame-static-basic#specific", category: "pipeline", variant: "specific" }, - { - enabledSkillIds: ["frame", "opensees-static"], - activatedSkillIds: ["frame"], - structuralSkillId: "frame", - analysisSkillId: "opensees-static", - toolCalls: [{ tool: "build_model", status: "success" }, { tool: "run_analysis", status: "success" }] - }, - "PASS" - ); - - nodeAssert.ok(summary.includes("PASS frame-static-basic#specific")); - nodeAssert.ok(summary.includes("[pipeline/specific]")); - nodeAssert.ok(summary.includes("enabled: frame, opensees-static")); - nodeAssert.ok(summary.includes("structural: frame")); - nodeAssert.ok(summary.includes("tools: build_model -> run_analysis")); -}); - -test("formatCaseSummary handles empty tool calls", () => { - const summary = formatCaseSummary( - { id: "test", category: 
"extraction", variant: "legacy" }, - { enabledSkillIds: undefined, activatedSkillIds: [], toolCalls: [] }, - "PASS" - ); - - nodeAssert.ok(summary.includes("enabled: (auto)")); - nodeAssert.ok(summary.includes("tools: (none)")); -}); - -test("appendArtifactRecord writes and appends records", () => { - const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "reporting-test-")); - const filePath = path.join(tmpDir, "output.json"); - - appendArtifactRecord(filePath, { id: "case-1", status: "PASS" }); - appendArtifactRecord(filePath, { id: "case-2", status: "FAIL" }); - - const records = JSON.parse(fs.readFileSync(filePath, "utf-8")); - nodeAssert.equal(records.length, 2); - nodeAssert.equal(records[0].id, "case-1"); - nodeAssert.equal(records[1].status, "FAIL"); - - fs.rmSync(tmpDir, { recursive: true }); -}); - -test("appendArtifactRecord creates parent directories", () => { - const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "reporting-test-")); - const filePath = path.join(tmpDir, "nested", "dir", "output.json"); - - appendArtifactRecord(filePath, { id: "case-1", status: "PASS" }); - - const records = JSON.parse(fs.readFileSync(filePath, "utf-8")); - nodeAssert.equal(records.length, 1); - - fs.rmSync(tmpDir, { recursive: true }); -}); diff --git a/tests/llm-integration/lib/retry.js b/tests/llm-integration/lib/retry.js deleted file mode 100644 index 3464354e..00000000 --- a/tests/llm-integration/lib/retry.js +++ /dev/null @@ -1,47 +0,0 @@ -const MAX_ATTEMPTS = 3; // 1 initial + 2 retries - -function shouldRetryError(err) { - const message = err instanceof Error ? 
err.message : String(err || ""); - - if (!message) { - return false; - } - - return ( - /\b(408|409|425|429)\b/u.test(message) - || /\b5\d{2}\b/u.test(message) - || /rate limit/iu.test(message) - || /quota exceeded/iu.test(message) - || /temporarily unavailable/iu.test(message) - || /overloaded/iu.test(message) - || /timeout/iu.test(message) - || /timed out/iu.test(message) - || /ECONNRESET|ECONNREFUSED|ETIMEDOUT|EAI_AGAIN/u.test(message) - || /socket hang up/iu.test(message) - ); -} - -/** - * Retry an async function up to MAX_ATTEMPTS times. - * By default only transient upstream failures retry; runner-level callers can - * opt into retrying every case failure to absorb LLM output drift. - */ -async function withRetry(fn, label = "test", maxAttempts = MAX_ATTEMPTS, options = {}) { - const retryOnAnyError = options.retryOnAnyError === true; - - for (let attempt = 1; attempt <= maxAttempts; attempt++) { - try { - return await fn(); - } catch (err) { - if (attempt === maxAttempts || (!retryOnAnyError && !shouldRetryError(err))) { - throw err; - } - const msg = err instanceof Error ? 
err.message : String(err); - process.stdout.write( - ` [RETRY] ${label} (attempt ${attempt}/${maxAttempts}) — ${msg}\n` - ); - } - } -} - -module.exports = { withRetry, MAX_ATTEMPTS, shouldRetryError }; diff --git a/tests/llm-integration/lib/retry.test.cjs b/tests/llm-integration/lib/retry.test.cjs deleted file mode 100644 index 03dc4d78..00000000 --- a/tests/llm-integration/lib/retry.test.cjs +++ /dev/null @@ -1,47 +0,0 @@ -const test = require("node:test"); -const nodeAssert = require("node:assert/strict"); - -const { withRetry } = require("./retry.js"); - -test("withRetry does not retry deterministic assertion failures", async () => { - let attempts = 0; - - await nodeAssert.rejects(async () => { - await withRetry(async () => { - attempts += 1; - throw new Error('expected tool "run_code_check" in successful tool calls'); - }, "pipeline-case", 8); - }, /run_code_check/); - - nodeAssert.equal(attempts, 1); -}); - -test("withRetry retries transient upstream failures", async () => { - let attempts = 0; - - const result = await withRetry(async () => { - attempts += 1; - if (attempts < 3) { - throw new Error("429 rate limited"); - } - return "ok"; - }, "upstream-case", 8); - - nodeAssert.equal(result, "ok"); - nodeAssert.equal(attempts, 3); -}); - -test("withRetry can retry deterministic failures when case-level retries are enabled", async () => { - let attempts = 0; - - const result = await withRetry(async () => { - attempts += 1; - if (attempts < 3) { - throw new Error('did not expect "frameDimension" in criticalMissing, but it was present'); - } - return "ok"; - }, "llm-case", 3, { retryOnAnyError: true }); - - nodeAssert.equal(result, "ok"); - nodeAssert.equal(attempts, 3); -}); diff --git a/tests/llm-integration/lib/selection.cjs b/tests/llm-integration/lib/selection.cjs deleted file mode 100644 index 33350832..00000000 --- a/tests/llm-integration/lib/selection.cjs +++ /dev/null @@ -1,85 +0,0 @@ -function parseLlmIntegrationOptions(args) { - let category; - let 
family; - let skillId; - let variant; - let scenarioId; - let outputPath; - - for (let index = 0; index < args.length; index += 1) { - const current = args[index]; - - // Handle --key=value form - const eqIndex = current.indexOf("="); - if (current.startsWith("--") && eqIndex > 2) { - const key = current.slice(0, eqIndex); - const value = current.slice(eqIndex + 1); - - if (key === "--family" || key === "--skill") { - family = value; - skillId = value; - } else if (key === "--variant") { - if (value !== "auto") variant = value; - } else if (key === "--scenario") { - scenarioId = value; - } else if (key === "--output") { - outputPath = value; - } - continue; - } - - if (current === "--family" || current === "--skill") { - family = args[index + 1]; - skillId = args[index + 1]; - index += 1; - continue; - } - if (current === "--variant") { - const raw = args[index + 1]; - if (raw !== "auto") variant = raw; - index += 1; - continue; - } - if (current === "--scenario") { - scenarioId = args[index + 1]; - index += 1; - continue; - } - if (current === "--output") { - outputPath = args[index + 1]; - index += 1; - continue; - } - if (!current.startsWith("--") && category === undefined) { - category = current; - } - } - - return { category, family, skillId, variant, scenarioId, outputPath }; -} - -function filterLlmTestCases(testCases, options = {}) { - return testCases.filter((testCase) => { - if (options.category && testCase.category !== options.category) { - return false; - } - if (options.skillId && testCase.skillId !== options.skillId) { - return false; - } - if (options.family && testCase.family !== options.family) { - return false; - } - if (options.variant && testCase.variant !== options.variant) { - return false; - } - if (options.scenarioId && testCase.scenarioId !== options.scenarioId) { - return false; - } - return true; - }); -} - -module.exports = { - parseLlmIntegrationOptions, - filterLlmTestCases, -}; diff --git a/tests/llm-integration/lib/selection.test.cjs 
b/tests/llm-integration/lib/selection.test.cjs deleted file mode 100644 index 24542dd6..00000000 --- a/tests/llm-integration/lib/selection.test.cjs +++ /dev/null @@ -1,101 +0,0 @@ -const test = require("node:test"); -const nodeAssert = require("node:assert/strict"); - -const { parseLlmIntegrationOptions, filterLlmTestCases } = require("./selection.cjs"); - -test("parseLlmIntegrationOptions reads category and skill filters", () => { - const options = parseLlmIntegrationOptions(["extraction", "--skill", "frame"]); - - nodeAssert.deepEqual(options, { - category: "extraction", - family: "frame", - skillId: "frame", - variant: undefined, - scenarioId: undefined, - outputPath: undefined, - }); -}); - -test("parseLlmIntegrationOptions defaults filters to undefined", () => { - const options = parseLlmIntegrationOptions([]); - - nodeAssert.deepEqual(options, { - category: undefined, - family: undefined, - skillId: undefined, - variant: undefined, - scenarioId: undefined, - outputPath: undefined, - }); -}); - -test("filterLlmTestCases narrows by category and skillId", () => { - const cases = [ - { id: "frame-extraction", category: "extraction", skillId: "frame" }, - { id: "frame-pipeline", category: "pipeline", skillId: "frame" }, - { id: "beam-extraction", category: "extraction", skillId: "beam" }, - ]; - - const filtered = filterLlmTestCases(cases, { - category: "extraction", - skillId: "frame", - }); - - nodeAssert.deepEqual(filtered.map((item) => item.id), ["frame-extraction"]); -}); - -test("parseLlmIntegrationOptions reads family, variant, scenario and output filters", () => { - const options = parseLlmIntegrationOptions([ - "pipeline", - "--family", "frame", - "--variant", "specific", - "--scenario", "frame-static-basic", - "--output", "tests/.artifacts/frame.json" - ]); - - nodeAssert.deepEqual(options, { - category: "pipeline", - family: "frame", - skillId: "frame", - variant: "specific", - scenarioId: "frame-static-basic", - outputPath: 
"tests/.artifacts/frame.json" - }); -}); - -test("filterLlmTestCases narrows by family, variant and scenarioId", () => { - const cases = [ - { id: "frame-static-basic#specific", family: "frame", variant: "specific", scenarioId: "frame-static-basic", category: "pipeline" }, - { id: "frame-static-basic#generic", family: "frame", variant: "generic", scenarioId: "frame-static-basic", category: "pipeline" }, - { id: "beam-basic#specific", family: "beam", variant: "specific", scenarioId: "beam-basic", category: "pipeline" } - ]; - - const filtered = filterLlmTestCases(cases, { - category: "pipeline", - family: "frame", - variant: "generic", - scenarioId: "frame-static-basic" - }); - - nodeAssert.deepEqual(filtered.map((item) => item.id), ["frame-static-basic#generic"]); -}); - -test("parseLlmIntegrationOptions handles --key=value form", () => { - const options = parseLlmIntegrationOptions(["--family=beam", "--variant=specific"]); - - nodeAssert.equal(options.family, "beam"); - nodeAssert.equal(options.skillId, "beam"); - nodeAssert.equal(options.variant, "specific"); -}); - -test("parseLlmIntegrationOptions treats --variant auto as no filter", () => { - const options = parseLlmIntegrationOptions(["--variant", "auto"]); - - nodeAssert.equal(options.variant, undefined); -}); - -test("parseLlmIntegrationOptions treats --variant=auto as no filter", () => { - const options = parseLlmIntegrationOptions(["--variant=auto"]); - - nodeAssert.equal(options.variant, undefined); -}); diff --git a/tests/llm-integration/lib/server.js b/tests/llm-integration/lib/server.js deleted file mode 100644 index d3e7c7c5..00000000 --- a/tests/llm-integration/lib/server.js +++ /dev/null @@ -1,48 +0,0 @@ -const { createRequire } = require("node:module"); -const path = require("node:path"); -const { pathToFileURL } = require("node:url"); - -/** - * Create a lightweight Fastify server with real backend routes for integration tests. - * Uses a temporary SQLite database and real LLM configuration. 
- * - * @param {object} context - Integration context with env vars - * @returns {Promise<{ app: import('fastify').FastifyInstance, close: () => Promise }>} - */ -async function createTestServer(context) { - const backendRequire = createRequire( - path.join(context.rootDir, "backend", "package.json") - ); - const Fastify = backendRequire("fastify"); - - const app = Fastify({ bodyLimit: 20 * 1024 * 1024 }); - - // Apply env vars so the backend config module picks them up - for (const [key, value] of Object.entries(context.env)) { - if (value !== undefined && value !== "") { - process.env[key] = value; - } - } - - // Register real backend routes - const { agentRoutes } = await import( - pathToFileURL(path.join(context.rootDir, "backend", "dist", "api", "agent.js")).href - ); - const { chatRoutes } = await import( - pathToFileURL(path.join(context.rootDir, "backend", "dist", "api", "chat.js")).href - ); - - await app.register(agentRoutes, { prefix: "/api/v1/agent" }); - await app.register(chatRoutes, { prefix: "/api/v1/chat" }); - - await app.ready(); - - return { - app, - async close() { - await app.close(); - }, - }; -} - -module.exports = { createTestServer }; diff --git a/tests/llm-integration/lib/summarize.test.cjs b/tests/llm-integration/lib/summarize.test.cjs deleted file mode 100644 index 0ff8722e..00000000 --- a/tests/llm-integration/lib/summarize.test.cjs +++ /dev/null @@ -1,29 +0,0 @@ -const test = require("node:test"); -const nodeAssert = require("node:assert/strict"); - -const { summarizeArtifacts } = require("../summarize.cjs"); - -test("summarizeArtifacts groups pass rate by family and variant", () => { - const summary = summarizeArtifacts([ - { family: "frame", variant: "specific", status: "PASS" }, - { family: "frame", variant: "specific", status: "FAIL" }, - { family: "frame", variant: "generic", status: "PASS" } - ]); - - nodeAssert.deepEqual(summary.frame.specific, { passed: 1, failed: 1, total: 2 }); - nodeAssert.deepEqual(summary.frame.generic, { 
passed: 1, failed: 0, total: 1 }); -}); - -test("summarizeArtifacts handles missing family/variant", () => { - const summary = summarizeArtifacts([ - { status: "PASS" } - ]); - - nodeAssert.ok(summary.unknown); - nodeAssert.deepEqual(summary.unknown.unknown, { passed: 1, failed: 0, total: 1 }); -}); - -test("summarizeArtifacts handles empty records", () => { - const summary = summarizeArtifacts([]); - nodeAssert.deepEqual(summary, {}); -}); diff --git a/tests/llm-integration/lib/trace.cjs b/tests/llm-integration/lib/trace.cjs deleted file mode 100644 index 2e463e39..00000000 --- a/tests/llm-integration/lib/trace.cjs +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Resolve the observed trace from test execution results. - * Normalizes draft and pipeline observations into a unified structure. - */ -function resolveObservedTrace({ testCase, draftResult, pipelineResult }) { - if (pipelineResult) { - return { - enabledSkillIds: testCase.enabledSkillIds, - selectedSkillIds: pipelineResult.routing?.selectedSkillIds || testCase.enabledSkillIds || [], - activatedSkillIds: pipelineResult.routing?.activatedSkillIds || [], - structuralSkillId: pipelineResult.routing?.structuralSkillId, - analysisSkillId: pipelineResult.routing?.analysisSkillId, - toolCalls: pipelineResult.toolCalls || [] - }; - } - return { - enabledSkillIds: testCase.enabledSkillIds, - selectedSkillIds: testCase.enabledSkillIds || [], - activatedSkillIds: [], - structuralSkillId: draftResult?.structuralTypeMatch?.skillId || draftResult?.stateToPersist?.skillId, - analysisSkillId: undefined, - toolCalls: [] - }; -} - -module.exports = { resolveObservedTrace }; diff --git a/tests/llm-integration/lib/trace.test.cjs b/tests/llm-integration/lib/trace.test.cjs deleted file mode 100644 index de497a34..00000000 --- a/tests/llm-integration/lib/trace.test.cjs +++ /dev/null @@ -1,50 +0,0 @@ -const test = require("node:test"); -const nodeAssert = require("node:assert/strict"); - -const { resolveObservedTrace } = 
require("./trace.cjs"); - -test("resolveObservedTrace normalizes draft observations", () => { - const trace = resolveObservedTrace({ - testCase: { enabledSkillIds: ["frame", "opensees-static"], fallbackPolicy: "forbid-generic" }, - draftResult: { structuralTypeMatch: { skillId: "frame" }, inferredType: "frame", extractionMode: "llm" } - }); - - nodeAssert.equal(trace.structuralSkillId, "frame"); - nodeAssert.deepEqual(trace.enabledSkillIds, ["frame", "opensees-static"]); - nodeAssert.deepEqual(trace.selectedSkillIds, ["frame", "opensees-static"]); - nodeAssert.equal(trace.analysisSkillId, undefined); - nodeAssert.deepEqual(trace.toolCalls, []); -}); - -test("resolveObservedTrace normalizes pipeline observations", () => { - const trace = resolveObservedTrace({ - testCase: { enabledSkillIds: ["frame", "opensees-static"] }, - pipelineResult: { - routing: { - selectedSkillIds: ["frame", "opensees-static"], - activatedSkillIds: ["frame", "opensees-static", "validation-structure-model"], - structuralSkillId: "frame", - analysisSkillId: "opensees-static" - }, - toolCalls: [ - { tool: "build_model", status: "success", authorizedBySkillIds: ["frame"] }, - { tool: "run_analysis", status: "success", authorizedBySkillIds: ["opensees-static"] } - ] - } - }); - - nodeAssert.equal(trace.structuralSkillId, "frame"); - nodeAssert.equal(trace.analysisSkillId, "opensees-static"); - nodeAssert.deepEqual(trace.activatedSkillIds, ["frame", "opensees-static", "validation-structure-model"]); - nodeAssert.equal(trace.toolCalls.length, 2); -}); - -test("resolveObservedTrace handles missing draft result", () => { - const trace = resolveObservedTrace({ - testCase: { enabledSkillIds: undefined } - }); - - nodeAssert.equal(trace.structuralSkillId, undefined); - nodeAssert.deepEqual(trace.enabledSkillIds, undefined); - nodeAssert.deepEqual(trace.selectedSkillIds, []); -}); diff --git a/tests/llm-integration/runner.cjs b/tests/llm-integration/runner.cjs deleted file mode 100644 index 
232ae18d..00000000 --- a/tests/llm-integration/runner.cjs +++ /dev/null @@ -1,224 +0,0 @@ -const path = require("node:path"); -const { pathToFileURL } = require("node:url"); - -const { resolveIntegrationContext } = require("./lib/context.js"); -const { createRealLlmClient } = require("./lib/real-llm-client.cjs"); -const { withRetry, MAX_ATTEMPTS } = require("./lib/retry.js"); -const { loadLlmFixtures } = require("./lib/discovery.cjs"); -const { parseLlmIntegrationOptions, filterLlmTestCases } = require("./lib/selection.cjs"); -const { - assertRoutingTrace, - assertToolAuthorizers, -} = require("./lib/assertions.js"); -const { - runRoutingTest, - runExtractionTest, - runPipelineTest, - runClarificationTest, -} = require("./lib/executors.cjs"); -const { resolveObservedTrace } = require("./lib/trace.cjs"); -const { formatCaseSummary, appendArtifactRecord } = require("./lib/reporting.cjs"); - -/** Import AgentSkillRuntime from backend dist. */ -async function importAgentSkillRuntime(rootDir) { - const filePath = path.join(rootDir, "backend", "dist", "agent-runtime", "index.js"); - const mod = await import(pathToFileURL(filePath).href); - return mod.AgentSkillRuntime; -} - -/** Import and instantiate LangGraphAgentService with real LLM. 
*/ -async function createAgentService(rootDir, skillRuntime) { - const filePath = path.join(rootDir, "backend", "dist", "agent-langgraph", "agent-service.js"); - const mod = await import(`${pathToFileURL(filePath).href}?llm-test=${Date.now()}`); - const LangGraphAgentService = mod.LangGraphAgentService; - return new LangGraphAgentService(skillRuntime); -} - -// --------------------------------------------------------------------------- -// Main runner -// --------------------------------------------------------------------------- - -async function runLlmIntegrationTests(rootDir, args) { - const maxAttempts = MAX_ATTEMPTS; - const context = resolveIntegrationContext(rootDir); - const options = parseLlmIntegrationOptions(args); - - // Inject LLM env vars into process.env BEFORE importing backend modules. - // The backend config module reads process.env at import time. - for (const [k, v] of Object.entries(context.env)) { - if (v !== undefined && v !== "") { - process.env[k] = v; - } - } - - // Ensure backend is built - const { runBackendBuildOnce } = require("../regression/shared.js"); - await runBackendBuildOnce(context); - - // Ensure DB is ready - const { execSync } = require("node:child_process"); - execSync("npx prisma db push --accept-data-loss", { - cwd: path.join(rootDir, "backend"), - env: { ...process.env, ...context.env }, - stdio: "pipe", - }); - - // Load test cases — default to routing-only since extraction/pipeline/clarification - // categories depend on the legacy AgentService API (textToModelDraft etc.) - // which no longer exists under the LangGraph ReAct architecture. 
- const effectiveCategory = options.category || "routing"; - const allCases = loadLlmFixtures(rootDir); - const cases = filterLlmTestCases(allCases, { ...options, category: effectiveCategory }); - - if (cases.length === 0) { - process.stdout.write("No test cases matched.\n"); - return; - } - - process.stdout.write(`\n${"=".repeat(60)}\n`); - process.stdout.write(`LLM Integration Tests: ${cases.length} cases\n`); - process.stdout.write(`Model: ${context.env.LLM_MODEL || "(default)"}\n`); - process.stdout.write(`Base URL: ${context.env.LLM_BASE_URL || "(default)"}\n`); - process.stdout.write(`Category: ${options.category || "(all)"}\n`); - process.stdout.write(`Skill: ${options.skillId || "(all)"}\n`); - process.stdout.write(`Family: ${options.family || "(all)"}\n`); - process.stdout.write(`Variant: ${options.variant || "(all)"}\n`); - process.stdout.write(`${"=".repeat(60)}\n\n`); - - // Create LLM client and runtime - const llm = createRealLlmClient(context, 0); - const AgentSkillRuntime = await importAgentSkillRuntime(rootDir); - const runtime = new AgentSkillRuntime(); - - let agentService = null; - - const results = { passed: 0, failed: 0, retried: 0, failures: [] }; - const startTime = Date.now(); - - for (const testCase of cases) { - const caseStart = Date.now(); - let draftResult = null; - let pipelineResult = null; - - try { - await withRetry(async () => { - switch (testCase.category) { - case "routing": - await runRoutingTest(runtime, testCase); - break; - case "extraction": - if (!llm) throw new Error("LLM client not available"); - draftResult = await runExtractionTest(runtime, llm, testCase); - break; - case "pipeline": { - if (!agentService) { - agentService = await createAgentService(rootDir, runtime); - } - pipelineResult = await runPipelineTest(agentService, testCase); - break; - } - case "clarification": - if (!llm) throw new Error("LLM client not available"); - draftResult = await runClarificationTest(runtime, llm, testCase); - break; - default: - 
throw new Error(`Unknown test category: ${testCase.category}`); - } - }, testCase.id, maxAttempts, { retryOnAnyError: true }); - - // Resolve observed trace - const observedTrace = resolveObservedTrace({ - testCase, - draftResult: draftResult || undefined, - pipelineResult: pipelineResult || undefined, - }); - - // Assert routing trace expectations - const expect = testCase.expect || {}; - if (expect.routing) { - assertRoutingTrace(observedTrace, expect.routing); - } - if (expect.toolAuthorizers) { - assertToolAuthorizers(observedTrace.toolCalls || [], expect.toolAuthorizers); - } - - // Check fallback policy - if (testCase.fallbackPolicy === "forbid-generic" && observedTrace.structuralSkillId === "generic") { - throw new Error(`unexpected generic fallback for ${testCase.id}`); - } - if (testCase.fallbackPolicy === "require-generic" && observedTrace.structuralSkillId !== "generic") { - throw new Error(`expected generic fallback for ${testCase.id}`); - } - - const duration = Date.now() - caseStart; - process.stdout.write(`${formatCaseSummary(testCase, observedTrace, "PASS")}\n`); - - if (options.outputPath) { - appendArtifactRecord(options.outputPath, { - id: testCase.id, - category: testCase.category, - variant: testCase.variant, - family: testCase.family, - enabledSkillIds: observedTrace.enabledSkillIds, - activatedSkillIds: observedTrace.activatedSkillIds, - structuralSkillId: observedTrace.structuralSkillId, - analysisSkillId: observedTrace.analysisSkillId, - toolCalls: observedTrace.toolCalls, - status: "PASS", - durationMs: duration, - }); - } - - results.passed += 1; - } catch (err) { - draftResult = draftResult || err?.draftResult || null; - pipelineResult = pipelineResult || err?.pipelineResult || null; - const duration = Date.now() - caseStart; - const message = err instanceof Error ? 
err.message : String(err); - - const failTrace = resolveObservedTrace({ - testCase, - draftResult: draftResult || undefined, - pipelineResult: pipelineResult || undefined, - }); - process.stdout.write(`${formatCaseSummary(testCase, failTrace, "FAIL")}\n`); - process.stdout.write(` error: ${message}\n`); - - if (options.outputPath) { - appendArtifactRecord(options.outputPath, { - id: testCase.id, - category: testCase.category, - variant: testCase.variant, - family: testCase.family, - enabledSkillIds: failTrace.enabledSkillIds, - activatedSkillIds: failTrace.activatedSkillIds, - structuralSkillId: failTrace.structuralSkillId, - analysisSkillId: failTrace.analysisSkillId, - toolCalls: failTrace.toolCalls, - status: "FAIL", - durationMs: duration, - error: message, - }); - } - - results.failed += 1; - results.failures.push({ id: testCase.id, error: message }); - } - } - - // Summary - const totalDuration = Date.now() - startTime; - process.stdout.write(`\n${"=".repeat(60)}\n`); - process.stdout.write(`Results: ${results.passed}/${cases.length} passed, ${results.failed} failed\n`); - if (results.failures.length > 0) { - process.stdout.write(`Failed: ${results.failures.map((f) => f.id).join(", ")}\n`); - } - process.stdout.write(`Total time: ${(totalDuration / 1000).toFixed(1)}s\n`); - process.stdout.write(`${"=".repeat(60)}\n\n`); - - if (results.failed > 0) { - process.exitCode = 1; - } -} - -module.exports = { runLlmIntegrationTests }; diff --git a/tests/llm-integration/summarize.cjs b/tests/llm-integration/summarize.cjs deleted file mode 100644 index 7ccc99c3..00000000 --- a/tests/llm-integration/summarize.cjs +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Summarize artifact records by grouping pass/fail rate by family and variant. 
- */ -function summarizeArtifacts(records) { - return records.reduce((acc, record) => { - const family = record.family || "unknown"; - const variant = record.variant || "unknown"; - acc[family] = acc[family] || {}; - acc[family][variant] = acc[family][variant] || { passed: 0, failed: 0, total: 0 }; - const bucket = acc[family][variant]; - bucket.total += 1; - if (record.status === "PASS") bucket.passed += 1; - else bucket.failed += 1; - return acc; - }, {}); -} - -/** - * Print a summary table to stdout. - */ -function printSummary(summary) { - for (const [family, variants] of Object.entries(summary)) { - process.stdout.write(`\n${family}:\n`); - for (const [variant, stats] of Object.entries(variants)) { - process.stdout.write(` ${variant}: passed=${stats.passed}, failed=${stats.failed}, total=${stats.total}\n`); - } - } -} - -module.exports = { summarizeArtifacts, printSummary }; diff --git a/tests/regression/backend-regression.js b/tests/regression/backend-regression.js index a48c6195..d9be58c5 100644 --- a/tests/regression/backend-regression.js +++ b/tests/regression/backend-regression.js @@ -20,6 +20,7 @@ const BACKEND_STEPS = [ ["Chat stream contract regression", "validate-chat-stream-contract"], ["Chat message routing contract", "validate-chat-message-routing"], ["Report narrative contract", "validate-report-narrative-contract"], + ["Skill routing regression", "validate-skill-routing"], ]; const JEST_ENV_FORWARD_KEYS = [ diff --git a/tests/regression/backend-validations.js b/tests/regression/backend-validations.js index 239fd24a..833745ca 100644 --- a/tests/regression/backend-validations.js +++ b/tests/regression/backend-validations.js @@ -1559,6 +1559,42 @@ async function validateStructureJsonSkill(context) { console.log("[ok] validation module runtime exports"); } +async function validateSkillRouting(context) { + const { loadLlmFixtures } = require("../llm-integration/lib/discovery.cjs"); + const AgentSkillRuntime = await 
importAgentSkillRuntime(context.rootDir); + const runtime = new AgentSkillRuntime(); + + const allCases = loadLlmFixtures(context.rootDir); + const routingCases = allCases.filter((c) => c.category === "routing"); + + assert(routingCases.length > 0, "should have at least one routing fixture case"); + + let passed = 0; + for (const testCase of routingCases) { + const locale = testCase.locale === "zh" ? "zh" : "en"; + const message = testCase.messages[0]; + const match = await runtime.detectStructuralType(message, locale, undefined, testCase.enabledSkillIds); + const expected = testCase.expect || {}; + + if (expected.inferredType) { + const actualKey = match.mappedType || match.key; + assert( + actualKey === expected.inferredType || match.skillId === expected.inferredType, + `[${testCase.id}] expected inferredType="${expected.inferredType}", got key="${match.key}" mappedType="${match.mappedType}" skillId="${match.skillId}"` + ); + } + if (expected.structuralTypeKey) { + assert( + match.key === expected.structuralTypeKey || match.mappedType === expected.structuralTypeKey, + `[${testCase.id}] expected structuralTypeKey="${expected.structuralTypeKey}", got key="${match.key}" mappedType="${match.mappedType}"` + ); + } + passed += 1; + } + + console.log(`[ok] skill routing: ${passed}/${routingCases.length} cases passed`); +} + const BACKEND_VALIDATIONS = { "validate-agent-orchestration": validateAgentOrchestration, "validate-agent-base-chat-fallback": validateAgentBaseChatFallback, @@ -1580,6 +1616,7 @@ const BACKEND_VALIDATIONS = { "validate-report-narrative-contract": validateReportNarrativeContract, "validate-dev-startup-guards": validateDevStartupGuards, "validate-structure-json-skill": validateStructureJsonSkill, + "validate-skill-routing": validateSkillRouting, }; async function runBackendValidation(name, context) { diff --git a/tests/runner.mjs b/tests/runner.mjs index 2197db55..1e807f0b 100644 --- a/tests/runner.mjs +++ b/tests/runner.mjs @@ -11,8 +11,6 @@ const { 
runBackendRegression } = require("./regression/backend-regression.js"); const { runAnalysisRegression } = require("./regression/analysis-regression.js"); const { runNativeInstallSmoke } = require("./smoke/install-smoke.cjs"); -const { runLlmIntegrationTests } = require("./llm-integration/runner.cjs"); -const { summarizeArtifacts, printSummary } = require("./llm-integration/summarize.cjs"); const { runBenchmark } = require("./llm-benchmark/runner.cjs"); function parseCliOptions(args) { @@ -73,22 +71,15 @@ Commands: validate --list List named validations check Run a grouped validation alias check --list List grouped validation aliases - backend-regression Backend regression bundle: build, lint, Jest, and validations + backend-regression Backend regression bundle: build, lint, Jest, and validations (includes skill routing) analysis-regression Deterministic engineering analysis regression - llm-integration Legacy LLM/routing integration tests (requires LLM_API_KEY) - supports: node tests/runner.mjs llm-integration [category] - default: routing - categories: routing | extraction | pipeline | clarification - [--family ] (alias: --skill) - [--variant ] - [--scenario ] - [--output ] - llm-benchmark LangGraph agent benchmark (requires LLM_API_KEY) - runs the full ReAct agent and evaluates scenario quality + llm-benchmark LangGraph agent benchmark with v2 assertions (requires LLM_API_KEY) + runs full ReAct agent end-to-end with skill-hit tracing and LLM-as-Judge + assertion types: structural_type | has_model | has_analysis | has_report | + skill_match | natural_language [--scenario ] [--output ] - llm-summary Summarize LLM test artifacts by family/variant - smoke-native Native install/build compatibility smoke + smoke-native CI-style native install smoke (npm ci + build) Replaces former sclaw commands: sclaw validate ... -> node tests/runner.mjs validate ... 
@@ -150,31 +141,9 @@ async function main() { case "smoke-native": await runNativeInstallSmoke(rootDir); return; - case "llm-integration": - await runLlmIntegrationTests(rootDir, rawArgs); - return; case "llm-benchmark": await runBenchmark(rootDir, rawArgs); return; - case "llm-summary": { - const artifactPath = rawArgs[0]; - if (!artifactPath) { - throw new Error("Usage: node tests/runner.mjs llm-summary "); - } - const fs = require("node:fs"); - if (!fs.existsSync(artifactPath)) { - throw new Error(`Artifact file not found: ${artifactPath}`); - } - const parsed = JSON.parse(fs.readFileSync(artifactPath, "utf-8")); - if (!Array.isArray(parsed)) { - throw new Error(`Expected a JSON array in ${artifactPath}, got ${typeof parsed}`); - } - const records = parsed; - const summary = summarizeArtifacts(records); - printSummary(summary); - process.stdout.write("\n"); - return; - } default: throw new Error(`Unknown command: ${cmd}`); }