ZhouChaunge · ZhouChaunge · May 7, 2026 · May 10, 2026 · May 10, 2026 · May 10, 2026
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
@@ -39,23 +39,27 @@ jobs:
     steps:
       - name: Get PR Reference
         id: get-ref
-        run: |
-          set -euo pipefail
-          if [ "${{ github.event_name }}" == "issue_comment" ]; then
-            PR_URL="${{ github.event.issue.pull_request.url }}"
-            SHA=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$PR_URL" | jq -r .head.sha)
-            if [ -z "$SHA" ] || [ "$SHA" = "null" ]; then
-              echo "::error::Failed to resolve PR head SHA"
-              exit 1
-            fi
-            echo "sha=$SHA" >> "$GITHUB_OUTPUT"
-            echo "issue_number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT"
-            echo "comment_user=${{ github.event.comment.user.login }}" >> "$GITHUB_OUTPUT"
-          else
-            echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
-            echo "issue_number=" >> "$GITHUB_OUTPUT"
-            echo "comment_user=" >> "$GITHUB_OUTPUT"
-          fi
+        uses: actions/github-script@v9
+        with:
+          script: |
+            // Publish the commit to test and the triggering issue/commenter as step outputs.
+            if (context.eventName === 'issue_comment') {
+              const { owner, repo } = context.repo;
+              const prNumber = context.issue.number;  // the PR the comment was posted on
+              const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: prNumber });
+              const sha = pr.head?.sha;
+              if (!sha) {
+                core.setFailed('Failed to resolve PR head SHA');
+                return;
+              }
+              core.setOutput('sha', sha);
+              core.setOutput('issue_number', String(prNumber));
+              core.setOutput('comment_user', context.payload.comment.user.login);
+            } else {
+              core.setOutput('sha', context.sha);  // runtime context value; nothing is spliced into the script source
+              core.setOutput('issue_number', '');
+              core.setOutput('comment_user', '');
+            }
 
       - name: Reply with Action Link
         if: github.event_name == 'issue_comment'
@@ -176,15 +180,20 @@ jobs:
         if: always() && needs.check-permission.outputs.trigger_issue_number != ''
         continue-on-error: true
         uses: actions/github-script@v9
+        env:
+          ISSUE_NUMBER: ${{ needs.check-permission.outputs.trigger_issue_number }}
+          COMMENT_USER: ${{ needs.check-permission.outputs.trigger_comment_user }}
+          TEST_OUTCOME: ${{ steps.run-tests.outcome }}
+          SUMMARY_JSON: ${{ toJson(steps.summary.outputs.summary) }}
         with:
           github-token: ${{ secrets.COMMENT_TOKEN }}
           script: |
-            const issueNumber = Number('${{ needs.check-permission.outputs.trigger_issue_number }}');
-            const commentUser = '${{ needs.check-permission.outputs.trigger_comment_user }}';
-            const success = '${{ steps.run-tests.outcome }}' === 'success';
+            const issueNumber = Number(process.env.ISSUE_NUMBER);
+            const commentUser = process.env.COMMENT_USER;
+            const success = process.env.TEST_OUTCOME === 'success';
+            const summary = JSON.parse(process.env.SUMMARY_JSON || '""');
             const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
 
-            const summary = ${{ toJson(steps.summary.outputs.summary) }};
             const icon = success ? '✅' : '❌';
             const status = success ? 'PASSED' : 'FAILED';
             const body = [

diff --git a/.github/workflows/llm-integration.yml → .github/workflows/llm-benchmark.yml b/.github/workflows/llm-integration.yml → .github/workflows/llm-benchmark.yml
@@ -1,4 +1,4 @@
-name: LLM Integration Tests
+name: LLM Benchmark
 
 on:
   workflow_dispatch:
@@ -7,24 +7,12 @@ on:
         description: "LLM model name (leave empty for default)"
         required: false
         default: ""
-      filter:
-        description: "Test category filter (routing|extraction|pipeline|clarification)"
+      scenario:
+        description: "Run a single scenario by ID (leave empty for all)"
         required: false
         default: ""
   issue_comment:
     types: [created]
-  push:
-    branches:
-      - master
-    paths:
-      - 'backend/**'
-      - 'scripts/**'
-      - 'tests/**'
-      - 'sclaw'
-      - 'sclaw_cn'
-      - 'package.json'
-      - '.github/workflows/llm-integration.yml'
-
 permissions:
   contents: read
 
@@ -36,41 +24,44 @@ jobs:
       issues: write
     if: |
       github.event_name == 'workflow_dispatch' ||
-      github.event_name == 'push' ||
       (github.event.issue.pull_request &&
-       contains(github.event.comment.body, '/test-llm') &&
+       contains(github.event.comment.body, '/test-llm-benchmark') &&
        (github.event.comment.user.login == 'guyi2000' || github.event.comment.user.login == 'qinsz01'))
     runs-on: ubuntu-latest
     outputs:
       pr_sha: ${{ steps.get-ref.outputs.sha }}
-      filter_arg: ${{ steps.get-ref.outputs.filter_arg }}
+      scenario_arg: ${{ steps.get-ref.outputs.scenario_arg }}
       trigger_issue_number: ${{ steps.get-ref.outputs.issue_number }}
       trigger_comment_user: ${{ steps.get-ref.outputs.comment_user }}
     steps:
       - name: Get PR Reference
         id: get-ref
-        env:
-          COMMENT_BODY: ${{ github.event.comment.body }}
-        run: |
-          set -euo pipefail
-          if [ "${{ github.event_name }}" == "issue_comment" ]; then
-            PR_URL="${{ github.event.issue.pull_request.url }}"
-            SHA=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$PR_URL" | jq -r .head.sha)
-            if [ -z "$SHA" ] || [ "$SHA" = "null" ]; then
-              echo "::error::Failed to resolve PR head SHA"
-              exit 1
-            fi
-            echo "sha=$SHA" >> "$GITHUB_OUTPUT"
-            FILTER=$(echo "$COMMENT_BODY" | sed -n 's|/test-llm[[:space:]]*\(.*\)|\1|p' | xargs)
-            echo "filter_arg=$FILTER" >> "$GITHUB_OUTPUT"
-            echo "issue_number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT"
-            echo "comment_user=${{ github.event.comment.user.login }}" >> "$GITHUB_OUTPUT"
-          else
-            echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
-            echo "filter_arg=${{ github.event.inputs.filter }}" >> "$GITHUB_OUTPUT"
-            echo "issue_number=" >> "$GITHUB_OUTPUT"
-            echo "comment_user=" >> "$GITHUB_OUTPUT"
-          fi
+        uses: actions/github-script@v9
+        with:
+          script: |
+            // Publish the commit to test, the optional scenario ID and the triggering issue/commenter as step outputs.
+            if (context.eventName === 'issue_comment') {
+              const { owner, repo } = context.repo;
+              const prNumber = context.issue.number;  // the PR the comment was posted on
+              const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: prNumber });
+              const sha = pr.head?.sha;
+              if (!sha) {
+                core.setFailed('Failed to resolve PR head SHA');
+                return;
+              }
+              const body = context.payload.comment.body || '';
+              const match = body.match(/\/test-llm-benchmark[ \t]*([a-zA-Z0-9_-]*)/);  // [ \t]* keeps the ID on the command's own line
+              const scenario = match ? match[1] : '';
+              core.setOutput('sha', sha);
+              core.setOutput('scenario_arg', scenario);
+              core.setOutput('issue_number', String(prNumber));
+              core.setOutput('comment_user', context.payload.comment.user.login);
+            } else {
+              core.setOutput('sha', context.sha);
+              core.setOutput('scenario_arg', context.payload.inputs?.scenario || '');  // read from the event payload, not spliced into the script source
+              core.setOutput('issue_number', '');
+              core.setOutput('comment_user', '');
+            }
 
       - name: Reply with Action Link
         if: github.event_name == 'issue_comment'
@@ -81,9 +72,9 @@ jobs:
           script: |
             const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
             const body = [
-              `✅ **LLM Integration Tests Triggered!**`,
+              `✅ **LLM Benchmark Triggered!**`,
               ``,
-              `@${context.payload.comment.user.login}, I've started the workflow for you.`,
+              `@${context.payload.comment.user.login}, I've started the benchmark run.`,
               `Please click the link below to monitor progress and **approve the deployment** to the test environment:`,
               ``,
               `🚀 **[View Action Run Details](${runUrl})**`,
@@ -96,17 +87,17 @@ jobs:
               body: body
             })
 
-  llm-integration:
+  llm-benchmark:
     needs: check-permission
     permissions:
       contents: read
       issues: write
     environment: test
     runs-on: ubuntu-latest
     concurrency:
-      group: llm-integration-${{ github.event.issue.number || github.sha }}
+      group: llm-benchmark-${{ github.event.issue.number || github.sha }}
       cancel-in-progress: true
-    timeout-minutes: 60
+    timeout-minutes: 90
     env:
       NEXT_TELEMETRY_DISABLED: 1
     steps:
@@ -133,33 +124,44 @@ jobs:
       - name: Build via sclaw
         run: node ./sclaw build
 
-      - name: Run LLM integration tests
-        id: run-tests
+      - name: Run LLM benchmark
+        id: run-benchmark
         env:
           LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
           LLM_MODEL: ${{ github.event.inputs.llm_model || vars.LLM_MODEL }}
           LLM_BASE_URL: ${{ vars.LLM_BASE_URL }}
+          LLM_JUDGE_API_KEY: ${{ secrets.LLM_JUDGE_API_KEY || secrets.LLM_API_KEY }}
+          LLM_JUDGE_MODEL: ${{ vars.LLM_JUDGE_MODEL }}
           LLM_LOG_ENABLED: "true"
           LLM_LOG_DIR: ${{ github.workspace }}/.structureclaw/logs
-          DATABASE_URL: "file:.structureclaw/data/structureclaw-llm-ci.db"
+          DATABASE_URL: "file:.structureclaw/data/structureclaw-benchmark-ci.db"
+          SCENARIO_ARG: ${{ needs.check-permission.outputs.scenario_arg }}  # handed to the shell as data; never expanded into the script text
         run: |
           set -o pipefail
-          FILTER_ARG="${{ needs.check-permission.outputs.filter_arg }}"
-          if [ -n "$FILTER_ARG" ]; then
-            node tests/runner.mjs llm-integration --filter "$FILTER_ARG" 2>&1 | tee test-output.txt
+          if [ -n "$SCENARIO_ARG" ]; then
+            node tests/runner.mjs llm-benchmark --scenario "$SCENARIO_ARG" --output benchmark-results.json 2>&1 | tee test-output.txt
           else
-            node tests/runner.mjs llm-integration 2>&1 | tee test-output.txt
+            node tests/runner.mjs llm-benchmark --output benchmark-results.json 2>&1 | tee test-output.txt
           fi
 
+      - name: Upload benchmark results
+        if: always()  # not skipped when an earlier step in this job fails
+        uses: actions/upload-artifact@v7
+        with:
+          name: benchmark-results
+          path: |
+            benchmark-results.json
+            .structureclaw/logs/
+
       - name: Extract test summary
         if: always()
         id: summary
         run: |
           if [ ! -f test-output.txt ]; then
-            echo "summary=Tests did not produce output." >> "$GITHUB_OUTPUT"
+            echo "summary=Benchmark did not produce output." >> "$GITHUB_OUTPUT"
             exit 0
           fi
-          SUMMARY=$(grep -B 1 -A 3 '^Results:' test-output.txt || echo "No summary found")
+          SUMMARY=$(grep -B 1 -A 5 '^Benchmark Results:' test-output.txt || echo "No summary found")
           echo "summary<<EOF" >> "$GITHUB_OUTPUT"
           echo "$SUMMARY" >> "$GITHUB_OUTPUT"
           echo "EOF" >> "$GITHUB_OUTPUT"
@@ -168,27 +170,32 @@ jobs:
         if: always() && needs.check-permission.outputs.trigger_issue_number != ''
         continue-on-error: true
         uses: actions/github-script@v9
+        env:
+          ISSUE_NUMBER: ${{ needs.check-permission.outputs.trigger_issue_number }}
+          COMMENT_USER: ${{ needs.check-permission.outputs.trigger_comment_user }}
+          BENCHMARK_OUTCOME: ${{ steps.run-benchmark.outcome }}
+          SUMMARY_JSON: ${{ toJson(steps.summary.outputs.summary) }}
         with:
           github-token: ${{ secrets.COMMENT_TOKEN }}
           script: |
-            const issueNumber = Number('${{ needs.check-permission.outputs.trigger_issue_number }}');
-            const commentUser = '${{ needs.check-permission.outputs.trigger_comment_user }}';
-            const success = '${{ steps.run-tests.outcome }}' === 'success';
-            const summary = ${{ toJson(steps.summary.outputs.summary) }};
+            const issueNumber = Number(process.env.ISSUE_NUMBER);
+            const commentUser = process.env.COMMENT_USER;
+            const success = process.env.BENCHMARK_OUTCOME === 'success';
+            const summary = JSON.parse(process.env.SUMMARY_JSON || '""');
             const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
 
             const icon = success ? '✅' : '❌';
             const status = success ? 'PASSED' : 'FAILED';
             const body = [
-              `${icon} **LLM Integration Tests ${status}**`,
+              `${icon} **LLM Benchmark ${status}**`,
               ``,
-              `@${commentUser}, the test run has completed.`,
+              `@${commentUser}, the benchmark run has completed.`,
               ``,
               `\`\`\``,
               summary,
               `\`\`\``,
               ``,
-              `📦 **[Download Logs](${runUrl})** or check the Artifacts section.`,
+              `📦 **[Download Results](${runUrl})** or check the Artifacts section.`,
             ].join('\n');
 
             await github.rest.issues.createComment({
@@ -197,12 +204,3 @@ jobs:
               repo: context.repo.repo,
               body: body
             })
-
-      - name: Upload LLM call logs
-        if: always()
-        uses: actions/upload-artifact@v7
-        with:
-          name: llm-logs-ubuntu
-          path: .structureclaw/logs/*.jsonl
-          retention-days: 7
-          if-no-files-found: ignore
diff --git a/AGENTS.md b/AGENTS.md
@@ -67,7 +67,8 @@ node tests/runner.mjs validate validate-agent-orchestration  # Agent orchestrati
 node tests/runner.mjs validate validate-chat-stream-contract # Chat stream contract
 node tests/runner.mjs validate validate-analyze-contract     # Analyze endpoint contract
 node tests/runner.mjs smoke-native                           # Full native install smoke (mirrors CI)
-node tests/runner.mjs llm-integration                        # LLM integration tests (needs LLM_API_KEY)
+node tests/runner.mjs llm-benchmark                          # LLM benchmark: v2 assertions, skill-trace, LLM-as-Judge (needs LLM_API_KEY)
+node tests/runner.mjs llm-benchmark --scenario <id>          # Run a single benchmark scenario
 ```
 
 ## Key Conventions