Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 30 additions & 21 deletions .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,27 @@ jobs:
steps:
- name: Get PR Reference
id: get-ref
run: |
set -euo pipefail
if [ "${{ github.event_name }}" == "issue_comment" ]; then
PR_URL="${{ github.event.issue.pull_request.url }}"
SHA=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$PR_URL" | jq -r .head.sha)
if [ -z "$SHA" ] || [ "$SHA" = "null" ]; then
echo "::error::Failed to resolve PR head SHA"
exit 1
fi
echo "sha=$SHA" >> "$GITHUB_OUTPUT"
echo "issue_number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT"
echo "comment_user=${{ github.event.comment.user.login }}" >> "$GITHUB_OUTPUT"
else
echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
echo "issue_number=" >> "$GITHUB_OUTPUT"
echo "comment_user=" >> "$GITHUB_OUTPUT"
fi
uses: actions/github-script@v9
with:
  script: |
    // Publishes three step outputs: the commit to test ("sha") and, for
    // comment-triggered runs, the PR number and the login of the commenter.
    if (context.eventName !== 'issue_comment') {
      // Any other trigger tests the triggering commit and leaves the PR fields empty.
      core.setOutput('sha', '${{ github.sha }}');
      core.setOutput('issue_number', '');
      core.setOutput('comment_user', '');
      return;
    }

    // Comment trigger: fetch the pull request and take its head commit.
    const [owner, repo] = context.payload.repository.full_name.split('/');
    const { data: pullRequest } = await github.rest.pulls.get({
      owner,
      repo,
      pull_number: context.payload.issue.number,
    });
    const headSha = pullRequest.head?.sha;
    if (!headSha) {
      core.setFailed('Failed to resolve PR head SHA');
      return;
    }

    core.setOutput('sha', headSha);
    core.setOutput('issue_number', String(context.issue.number));
    core.setOutput('comment_user', context.payload.comment.user.login);

- name: Reply with Action Link
if: github.event_name == 'issue_comment'
Expand Down Expand Up @@ -176,15 +180,20 @@ jobs:
if: always() && needs.check-permission.outputs.trigger_issue_number != ''
continue-on-error: true
uses: actions/github-script@v9
env:
ISSUE_NUMBER: ${{ needs.check-permission.outputs.trigger_issue_number }}
COMMENT_USER: ${{ needs.check-permission.outputs.trigger_comment_user }}
TEST_OUTCOME: ${{ steps.run-tests.outcome }}
SUMMARY_JSON: ${{ toJson(steps.summary.outputs.summary) }}
with:
github-token: ${{ secrets.COMMENT_TOKEN }}
script: |
const issueNumber = Number('${{ needs.check-permission.outputs.trigger_issue_number }}');
const commentUser = '${{ needs.check-permission.outputs.trigger_comment_user }}';
const success = '${{ steps.run-tests.outcome }}' === 'success';
const issueNumber = Number(process.env.ISSUE_NUMBER);
const commentUser = process.env.COMMENT_USER;
const success = process.env.TEST_OUTCOME === 'success';
const summary = JSON.parse(process.env.SUMMARY_JSON || '""');
const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;

const summary = ${{ toJson(steps.summary.outputs.summary) }};
const icon = success ? '✅' : '❌';
const status = success ? 'PASSED' : 'FAILED';
const body = [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: LLM Integration Tests
name: LLM Benchmark

on:
workflow_dispatch:
Expand All @@ -7,24 +7,12 @@ on:
description: "LLM model name (leave empty for default)"
required: false
default: ""
filter:
description: "Test category filter (routing|extraction|pipeline|clarification)"
scenario:
description: "Run a single scenario by ID (leave empty for all)"
required: false
default: ""
issue_comment:
types: [created]
push:
branches:
- master
paths:
- 'backend/**'
- 'scripts/**'
- 'tests/**'
- 'sclaw'
- 'sclaw_cn'
- 'package.json'
- '.github/workflows/llm-integration.yml'

permissions:
contents: read

Expand All @@ -36,41 +24,44 @@ jobs:
issues: write
if: |
github.event_name == 'workflow_dispatch' ||
github.event_name == 'push' ||
(github.event.issue.pull_request &&
contains(github.event.comment.body, '/test-llm') &&
contains(github.event.comment.body, '/test-llm-benchmark') &&
(github.event.comment.user.login == 'guyi2000' || github.event.comment.user.login == 'qinsz01'))
runs-on: ubuntu-latest
outputs:
pr_sha: ${{ steps.get-ref.outputs.sha }}
filter_arg: ${{ steps.get-ref.outputs.filter_arg }}
scenario_arg: ${{ steps.get-ref.outputs.scenario_arg }}
trigger_issue_number: ${{ steps.get-ref.outputs.issue_number }}
trigger_comment_user: ${{ steps.get-ref.outputs.comment_user }}
steps:
- name: Get PR Reference
id: get-ref
env:
COMMENT_BODY: ${{ github.event.comment.body }}
run: |
set -euo pipefail
if [ "${{ github.event_name }}" == "issue_comment" ]; then
PR_URL="${{ github.event.issue.pull_request.url }}"
SHA=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" "$PR_URL" | jq -r .head.sha)
if [ -z "$SHA" ] || [ "$SHA" = "null" ]; then
echo "::error::Failed to resolve PR head SHA"
exit 1
fi
echo "sha=$SHA" >> "$GITHUB_OUTPUT"
FILTER=$(echo "$COMMENT_BODY" | sed -n 's|/test-llm[[:space:]]*\(.*\)|\1|p' | xargs)
echo "filter_arg=$FILTER" >> "$GITHUB_OUTPUT"
echo "issue_number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT"
echo "comment_user=${{ github.event.comment.user.login }}" >> "$GITHUB_OUTPUT"
else
echo "sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
echo "filter_arg=${{ github.event.inputs.filter }}" >> "$GITHUB_OUTPUT"
echo "issue_number=" >> "$GITHUB_OUTPUT"
echo "comment_user=" >> "$GITHUB_OUTPUT"
fi
uses: actions/github-script@v9
with:
  script: |
    // Publishes four step outputs: the commit to benchmark ("sha"), an optional
    // single-scenario ID ("scenario_arg"), and, for comment-triggered runs, the
    // PR number and the login of the commenter.
    //
    // Every value is read from the event context at runtime instead of being
    // templated into this script, so a quote or backslash in a user-supplied
    // value cannot change the JavaScript that runs.
    if (context.eventName === 'issue_comment') {
      const [owner, repo] = context.payload.repository.full_name.split('/');
      const prNumber = context.payload.issue.number;
      const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: prNumber });
      const sha = pr.head?.sha;
      if (!sha) {
        core.setFailed('Failed to resolve PR head SHA');
        return;
      }

      // "/test-llm-benchmark <id>" selects one scenario. The capture group is
      // empty when no ID made of [a-zA-Z0-9_-] follows the command.
      const body = context.payload.comment.body || '';
      const match = body.match(/\/test-llm-benchmark\s*([a-zA-Z0-9_-]*)/);
      const scenario = match ? match[1] : '';

      core.setOutput('sha', sha);
      core.setOutput('scenario_arg', scenario);
      core.setOutput('issue_number', String(context.issue.number));
      core.setOutput('comment_user', context.payload.comment.user.login);
    } else {
      // Manual dispatch: benchmark the triggering commit. The "scenario" input
      // is taken from the event payload; it is an empty string when not set.
      core.setOutput('sha', context.sha);
      core.setOutput('scenario_arg', context.payload.inputs?.scenario ?? '');
      core.setOutput('issue_number', '');
      core.setOutput('comment_user', '');
    }

- name: Reply with Action Link
if: github.event_name == 'issue_comment'
Expand All @@ -81,9 +72,9 @@ jobs:
script: |
const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
const body = [
`✅ **LLM Integration Tests Triggered!**`,
`✅ **LLM Benchmark Triggered!**`,
``,
`@${context.payload.comment.user.login}, I've started the workflow for you.`,
`@${context.payload.comment.user.login}, I've started the benchmark run.`,
`Please click the link below to monitor progress and **approve the deployment** to the test environment:`,
``,
`🚀 **[View Action Run Details](${runUrl})**`,
Expand All @@ -96,17 +87,17 @@ jobs:
body: body
})

llm-integration:
llm-benchmark:
needs: check-permission
permissions:
contents: read
issues: write
environment: test
runs-on: ubuntu-latest
concurrency:
group: llm-integration-${{ github.event.issue.number || github.sha }}
group: llm-benchmark-${{ github.event.issue.number || github.sha }}
cancel-in-progress: true
timeout-minutes: 60
timeout-minutes: 90
env:
NEXT_TELEMETRY_DISABLED: 1
steps:
Expand All @@ -133,33 +124,44 @@ jobs:
- name: Build via sclaw
run: node ./sclaw build

- name: Run LLM integration tests
id: run-tests
- name: Run LLM benchmark
id: run-benchmark
env:
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
LLM_MODEL: ${{ github.event.inputs.llm_model || vars.LLM_MODEL }}
LLM_BASE_URL: ${{ vars.LLM_BASE_URL }}
LLM_JUDGE_API_KEY: ${{ secrets.LLM_JUDGE_API_KEY || secrets.LLM_API_KEY }}
LLM_JUDGE_MODEL: ${{ vars.LLM_JUDGE_MODEL }}
LLM_LOG_ENABLED: "true"
LLM_LOG_DIR: ${{ github.workspace }}/.structureclaw/logs
DATABASE_URL: "file:.structureclaw/data/structureclaw-llm-ci.db"
DATABASE_URL: "file:.structureclaw/data/structureclaw-benchmark-ci.db"
run: |
set -o pipefail
FILTER_ARG="${{ needs.check-permission.outputs.filter_arg }}"
if [ -n "$FILTER_ARG" ]; then
node tests/runner.mjs llm-integration --filter "$FILTER_ARG" 2>&1 | tee test-output.txt
SCENARIO_ARG="${{ needs.check-permission.outputs.scenario_arg }}"
if [ -n "$SCENARIO_ARG" ]; then
node tests/runner.mjs llm-benchmark --scenario "$SCENARIO_ARG" --output benchmark-results.json 2>&1 | tee test-output.txt
else
node tests/runner.mjs llm-integration 2>&1 | tee test-output.txt
node tests/runner.mjs llm-benchmark --output benchmark-results.json 2>&1 | tee test-output.txt
fi

# Publish the benchmark report and the LLM call logs, even when the run failed.
- name: Upload benchmark results
  if: always()
  uses: actions/upload-artifact@v7
  with:
    name: benchmark-results
    path: |
      benchmark-results.json
      .structureclaw/logs/
    # Keep the artifact (it includes LLM call logs) for one week rather than
    # the repository's default retention period.
    retention-days: 7

- name: Extract test summary
if: always()
id: summary
run: |
if [ ! -f test-output.txt ]; then
echo "summary=Tests did not produce output." >> "$GITHUB_OUTPUT"
echo "summary=Benchmark did not produce output." >> "$GITHUB_OUTPUT"
exit 0
fi
SUMMARY=$(grep -B 1 -A 3 '^Results:' test-output.txt || echo "No summary found")
SUMMARY=$(grep -B 1 -A 5 '^Benchmark Results:' test-output.txt || echo "No summary found")
echo "summary<<EOF" >> "$GITHUB_OUTPUT"
echo "$SUMMARY" >> "$GITHUB_OUTPUT"
echo "EOF" >> "$GITHUB_OUTPUT"
Expand All @@ -168,27 +170,32 @@ jobs:
if: always() && needs.check-permission.outputs.trigger_issue_number != ''
continue-on-error: true
uses: actions/github-script@v9
env:
ISSUE_NUMBER: ${{ needs.check-permission.outputs.trigger_issue_number }}
COMMENT_USER: ${{ needs.check-permission.outputs.trigger_comment_user }}
BENCHMARK_OUTCOME: ${{ steps.run-benchmark.outcome }}
SUMMARY_JSON: ${{ toJson(steps.summary.outputs.summary) }}
with:
github-token: ${{ secrets.COMMENT_TOKEN }}
script: |
const issueNumber = Number('${{ needs.check-permission.outputs.trigger_issue_number }}');
const commentUser = '${{ needs.check-permission.outputs.trigger_comment_user }}';
const success = '${{ steps.run-tests.outcome }}' === 'success';
const summary = ${{ toJson(steps.summary.outputs.summary) }};
const issueNumber = Number(process.env.ISSUE_NUMBER);
const commentUser = process.env.COMMENT_USER;
const success = process.env.BENCHMARK_OUTCOME === 'success';
const summary = JSON.parse(process.env.SUMMARY_JSON || '""');
const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;

const icon = success ? '✅' : '❌';
const status = success ? 'PASSED' : 'FAILED';
const body = [
`${icon} **LLM Integration Tests ${status}**`,
`${icon} **LLM Benchmark ${status}**`,
``,
`@${commentUser}, the test run has completed.`,
`@${commentUser}, the benchmark run has completed.`,
``,
`\`\`\``,
summary,
`\`\`\``,
``,
`📦 **[Download Logs](${runUrl})** or check the Artifacts section.`,
`📦 **[Download Results](${runUrl})** or check the Artifacts section.`,
].join('\n');

await github.rest.issues.createComment({
Expand All @@ -197,12 +204,3 @@ jobs:
repo: context.repo.repo,
body: body
})

- name: Upload LLM call logs
if: always()
uses: actions/upload-artifact@v7
with:
name: llm-logs-ubuntu
path: .structureclaw/logs/*.jsonl
retention-days: 7
if-no-files-found: ignore
3 changes: 2 additions & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ node tests/runner.mjs validate validate-agent-orchestration # Agent orchestrati
node tests/runner.mjs validate validate-chat-stream-contract # Chat stream contract
node tests/runner.mjs validate validate-analyze-contract # Analyze endpoint contract
node tests/runner.mjs smoke-native # Full native install smoke (mirrors CI)
node tests/runner.mjs llm-integration # LLM integration tests (needs LLM_API_KEY)
node tests/runner.mjs llm-benchmark # LLM benchmark: v2 assertions, skill-trace, LLM-as-Judge (needs LLM_API_KEY)
node tests/runner.mjs llm-benchmark --scenario <id> # Run a single benchmark scenario
```

## Key Conventions
Expand Down
Loading
Loading