Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 31 additions & 14 deletions .github/workflows/main-rtt.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ jobs:
echo "No package tgz produced." >&2
exit 1
fi
package_version="$(node -e 'const fs = require("node:fs"); process.stdout.write(JSON.parse(fs.readFileSync("package.json", "utf8")).version)')+$(git rev-parse --short=10 HEAD)"
echo "package_tgz=${PWD}/${package_tgz}" >>"$GITHUB_OUTPUT"
echo "package_version=$package_version" >>"$GITHUB_OUTPUT"

- name: Run RTT
id: rtt
Expand All @@ -115,35 +117,50 @@ jobs:
echo "samples must be a positive integer, got: $samples" >&2
exit 1
fi
output="$RUNNER_TEMP/openclaw-rtt-runs/main"
rtt_env=(
OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="openclaw@main"
OPENCLAW_NPM_TELEGRAM_PACKAGE_TGZ="${{ steps.pack.outputs.package_tgz }}"
OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw"
OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000
)
if [[ "$samples" != "20" ]]; then
rtt_env+=(OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES="$samples")
fi
metrics_path="$RUNNER_TEMP/openclaw-rtt-resource-metrics.env"
started_at="$(node -e 'process.stdout.write(new Date().toISOString())')"
set +e
/usr/bin/time \
env \
"${rtt_env[@]}" \
/usr/bin/time \
-f 'max_rss_kb=%M\nelapsed_seconds=%e' \
-o "$metrics_path" \
pnpm rtt openclaw@main \
--package-tgz "${{ steps.pack.outputs.package_tgz }}" \
--harness-root "$PWD" \
--output "$RUNNER_TEMP/openclaw-rtt-runs" \
--samples "$samples" \
--timeout-ms 240000 \
--sample-timeout-ms 30000
pnpm test:docker:npm-telegram-live
status="$?"
set -e
result_path="$(find "$RUNNER_TEMP/openclaw-rtt-runs" -maxdepth 3 -name result.json -type f | sort | tail -1)"
if [[ -z "$result_path" ]]; then
echo "No RTT result.json produced." >&2
finished_at="$(node -e 'process.stdout.write(new Date().toISOString())')"
evidence_path="$output/raw/qa-evidence.json"
if [[ ! -f "$evidence_path" ]]; then
echo "No qa-evidence.json produced." >&2
exit "$status"
fi
echo "result_path=$result_path" >>"$GITHUB_OUTPUT"
echo "resource_metrics_path=$metrics_path" >>"$GITHUB_OUTPUT"
{
echo "evidence_path=$evidence_path"
echo "resource_metrics_path=$metrics_path"
echo "started_at=$started_at"
echo "finished_at=$finished_at"
} >>"$GITHUB_OUTPUT"

- name: Import result
working-directory: openclaw-rtt
shell: bash
run: |
set -euo pipefail
git pull --rebase origin main
node scripts/import-result.mjs "${{ steps.rtt.outputs.result_path }}" \
node scripts/import-result.mjs "${{ steps.rtt.outputs.evidence_path }}" \
--version "${{ steps.pack.outputs.package_version }}" \
--started-at "${{ steps.rtt.outputs.started_at }}" \
--finished-at "${{ steps.rtt.outputs.finished_at }}" \
--resource-metrics "${{ steps.rtt.outputs.resource_metrics_path }}"
node scripts/validate.mjs
node scripts/summary.mjs
Expand Down
81 changes: 40 additions & 41 deletions .github/workflows/stable-release-rtt.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,40 +131,43 @@ jobs:
shell: bash
run: |
set -euo pipefail
for spec in ${{ steps.release.outputs.specs }}; do
read -r -a specs <<< "${{ steps.release.outputs.specs }}"
read -r -a versions <<< "${{ steps.release.outputs.versions }}"
for index in "${!specs[@]}"; do
spec="${specs[$index]}"
version="${versions[$index]}"
output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')"
metrics_path="${output}/resource-metrics.env"
mkdir -p "$output"
set +e
/usr/bin/time \
OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \
OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \
OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \
/usr/bin/time \
-f 'max_rss_kb=%M\nelapsed_seconds=%e' \
-o "$metrics_path" \
timeout --kill-after=30s 10m \
pnpm rtt "$spec" \
--harness-root "$PWD" \
--output "$output" \
--samples 20 \
--timeout-ms 240000 \
--sample-timeout-ms 30000
pnpm test:docker:npm-telegram-live
status="$?"
set -e
echo "exit_status=${status}" >>"$metrics_path"
result_path="$(find "$output" -maxdepth 3 -name result.json -type f | sort | tail -1)"
if [[ -z "$result_path" ]]; then
echo "Skipping RSS backfill for $spec because the rerun did not produce a result." >&2
evidence_path="$output/raw/qa-evidence.json"
if [[ ! -f "$evidence_path" ]]; then
echo "Skipping RSS backfill for $spec because the rerun did not produce qa-evidence.json." >&2
continue
fi
run_status="$(node -e 'const fs = require("node:fs"); const result = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); process.stdout.write(String(result.run?.status || ""));' "$result_path")"
run_status="$(node -e 'const fs = require("node:fs"); const evidence = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); const failed = !Array.isArray(evidence.entries) || evidence.entries.some((entry) => entry?.result?.status !== "pass"); process.stdout.write(failed ? "fail" : "pass");' "$evidence_path")"
if [[ "$run_status" != "pass" ]]; then
echo "Skipping RSS backfill for failed rerun ${result_path} (status=${run_status:-missing})." >&2
echo "Skipping RSS backfill for failed rerun ${evidence_path} (status=${run_status:-missing})." >&2
continue
fi
(
cd ../openclaw-rtt
git pull --rebase origin "${GITHUB_REF_NAME:-main}"
node scripts/backfill-release-rss.mjs \
--family telegram \
--result "$result_path" \
--spec "$spec" \
--version "$version" \
--resource-metrics "$metrics_path"
node scripts/validate.mjs
git add data/channels/telegram/ runs/telegram/
Expand Down Expand Up @@ -195,36 +198,40 @@ jobs:
set -euo pipefail
result_paths="$RUNNER_TEMP/openclaw-rtt-result-paths.txt"
: >"$result_paths"
for spec in ${{ steps.release.outputs.specs }}; do
read -r -a specs <<< "${{ steps.release.outputs.specs }}"
read -r -a versions <<< "${{ steps.release.outputs.versions }}"
for index in "${!specs[@]}"; do
spec="${specs[$index]}"
version="${versions[$index]}"
output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')"
metrics_path="${output}/resource-metrics.env"
mkdir -p "$output"
started_at="$(node -e 'process.stdout.write(new Date().toISOString())')"
set +e
/usr/bin/time \
OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \
OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \
OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \
/usr/bin/time \
-f 'max_rss_kb=%M\nelapsed_seconds=%e' \
-o "$metrics_path" \
pnpm rtt "$spec" \
--harness-root "$PWD" \
--output "$output" \
--samples 20 \
--timeout-ms 240000 \
--sample-timeout-ms 30000
pnpm test:docker:npm-telegram-live
status="$?"
set -e
finished_at="$(node -e 'process.stdout.write(new Date().toISOString())')"
echo "exit_status=${status}" >>"$metrics_path"
result_path="$(find "$output" -maxdepth 3 -name result.json -type f | sort | tail -1)"
if [[ -z "$result_path" ]]; then
echo "No RTT result.json produced for $spec." >&2
evidence_path="$output/raw/qa-evidence.json"
if [[ ! -f "$evidence_path" ]]; then
echo "No qa-evidence.json produced for $spec." >&2
if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then
echo "Skipping RSS backfill for $spec because the rerun did not produce a result." >&2
echo "Skipping RSS backfill for $spec because the rerun did not produce qa-evidence.json." >&2
continue
fi
if [[ "$status" -eq 0 ]]; then
status=1
fi
exit "$status"
fi
printf '%s\t%s\n' "$result_path" "$metrics_path" >>"$result_paths"
printf '%s\t%s\t%s\t%s\t%s\n' "$evidence_path" "$metrics_path" "$version" "$started_at" "$finished_at" >>"$result_paths"
done
echo "result_paths=$result_paths" >>"$GITHUB_OUTPUT"

Expand All @@ -235,20 +242,12 @@ jobs:
run: |
set -euo pipefail
git pull --rebase origin "${GITHUB_REF_NAME:-main}"
while IFS=$'\t' read -r result_path metrics_path; do
if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then
run_status="$(node -e 'const fs = require("node:fs"); const result = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); process.stdout.write(String(result.run?.status || ""));' "$result_path")"
if [[ "$run_status" != "pass" ]]; then
echo "Skipping RSS backfill for failed rerun ${result_path} (status=${run_status:-missing})." >&2
continue
fi
node scripts/backfill-release-rss.mjs \
--family telegram \
--result "$result_path" \
--resource-metrics "$metrics_path"
else
node scripts/import-result.mjs "$result_path" --resource-metrics "$metrics_path"
fi
while IFS=$'\t' read -r evidence_path metrics_path version started_at finished_at; do
node scripts/import-result.mjs "$evidence_path" \
--version "$version" \
--started-at "$started_at" \
--finished-at "$finished_at" \
--resource-metrics "$metrics_path"
done <"${{ steps.rtt.outputs.result_paths }}"
node scripts/validate.mjs
node scripts/summary.mjs
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ Latest imported surface run: `2026-06-12T13:45:18.208Z`

## Telegram Release Runs

Telegram release runs use the OpenClaw repo black-box harness on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target normal-reply samples, a 240s canary timeout, and a 30s per-sample timeout.
Telegram release runs use the OpenClaw package Telegram live QA lane on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target RTT checks, a 240s scenario timeout, and a 30s per-check timeout. New rows import aggregate timing from `qa-evidence.json`; older rows imported by the retired package RTT wrapper keep their historical per-sample arrays.

The system under test is the published package running its own Telegram bot. The OpenClaw repo only supplies the mock model server and Telegram driver. `p50` is the median normal-reply RTT. Log notes: [2026-05-02 Testbox stable sweep](logs/2026-05-02-testbox-stable-sweep.md).

Expand Down
4 changes: 2 additions & 2 deletions docs/channel-expansion.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
## Current State

- All imported rows use `data/channels/<channel>/<version>.jsonl` and `runs/<channel>/<run-id>/result.json`.
- Telegram main/release RTT still uses the older `pnpm rtt` source shape, but it now writes through the shared Telegram channel storage path.
- Telegram main/release RTT uses the OpenClaw package Telegram live lane and imports aggregate timing from `qa-evidence.json` through the shared Telegram channel storage path.
- Discord main/release RTT uses the live QA lane with a specialized importer because its summary currently needs observed-message timestamp fallback.
- Slack and WhatsApp main RTT use the reusable live-transport importer.
- The Discord release resolver backfills missing versions from the Telegram release baseline before measuring future versions. It skips releases that predate or fail the Discord canary contract instead of reporting them as runnable gaps.
Expand Down Expand Up @@ -37,7 +37,7 @@ Each sample is wrapped with `/usr/bin/time` and imports process max RSS in kilob

Discord is intentionally not migrated to the generic live-transport importer yet. Its summary currently omits RTT fields, so the generic importer supports observed-message timestamp fallback and has test coverage for that path, but the existing Discord workflow remains stable while the new channel lane proves itself.

Telegram is listed in the channel config for the future live-transport path, but the current production graph remains on the older `pnpm rtt` package-result path because that is what release sweeps already use.
Telegram is listed in the channel config because the package Telegram live lane now emits QA evidence for `telegram-mentioned-message-reply`. Future Telegram rows keep the same dashboard metrics as older rows, but the source is aggregate `qa-evidence.json` timing rather than the retired package RTT wrapper's per-sample result JSON.

Do not read cross-channel values as pure transport rankings. Telegram release rows use `telegram-mentioned-message-reply`; Discord, Slack, and WhatsApp rows use canary scenarios. The live-transport lane also includes QA-lab process overhead in RSS because the measured process is `pnpm openclaw qa <channel>`, not only the channel adapter.

Expand Down
14 changes: 6 additions & 8 deletions docs/data-imports.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
Run importers from the repo root:

```sh
node scripts/import-result.mjs ../clawdbot/runs/<run-id>/result.json
node scripts/import-result.mjs ../clawdbot/runs/<run-id>/result.json --resource-metrics resource-metrics.env
node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live/<run>/qa-evidence.json --version <version> --started-at <iso> --finished-at <iso>
node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live/<run>/qa-evidence.json --version <version> --started-at <iso> --finished-at <iso> --resource-metrics resource-metrics.env
node scripts/import-discord-rtt.mjs samples.tsv --spec openclaw@main --version <ref>
(cd ../openclaw && node --import tsx ../openclaw-rtt/scripts/measure-rpc-rtt.mjs --output-dir ../openclaw-rtt/.artifacts/rpc-rtt/sample-1)
node scripts/import-surface-rtt.mjs rpc-samples.tsv --surface rpc --spec openclaw@main --version <ref> --provider-mode gateway-rpc --scenario rpc-gateway-smoke --require-pass
Expand All @@ -18,13 +18,11 @@ node scripts/backfill-release-rss.mjs --family discord --spec openclaw@2026.5.16
node scripts/summary.mjs
```

Telegram release imports expect the `result.json` shape emitted by:
Telegram release imports expect the aggregate `qa-evidence.json` shape emitted by the OpenClaw package Telegram live lane. The OpenClaw harness checkout must include the package Telegram RTT evidence path; older OpenClaw packages can still be the system under test.

```sh
pnpm rtt openclaw@beta
pnpm rtt openclaw@beta --samples 20
pnpm rtt openclaw@latest
pnpm rtt openclaw@2026.4.30 --provider live-frontier
OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC=openclaw@beta \
pnpm test:docker:npm-telegram-live
```

## Data Layout
Expand All @@ -36,7 +34,7 @@ pnpm rtt openclaw@2026.4.30 --provider live-frontier

Current channel folders are `telegram`, `discord`, `slack`, and `whatsapp`. Telegram and Discord still have specialized importers because their source artifact shapes differ; they now share the same storage contract as generic live-transport channels.

Raw Telegram QA artifacts stay in the OpenClaw repo artifact directory unless explicitly copied here later.
Raw Telegram QA artifacts stay in the OpenClaw repo artifact directory unless explicitly copied here later. Historical Telegram rows imported from the old package RTT wrapper may include `rtt.warmSamples` with every successful sample value. New Telegram imports from `qa-evidence.json` preserve the aggregate dashboard metrics (`canaryMs`, `mentionReplyMs`, `avgMs`, `p50Ms`, `p95Ms`, `maxMs`, `sampleCount`, and `failedSamples`) but do not reconstruct individual sample RTT arrays because the evidence artifact stores aggregate timing.

Release RSS backfills only write `resources` onto an existing Telegram or Discord row and its copied `result.json`. The backfill command asserts the stored RTT `p50` and `p95` values are unchanged before it rewrites that version's JSONL file. RSS is process-level data around the sampled command, not isolated channel transport memory.

Expand Down
5 changes: 4 additions & 1 deletion scripts/backfill-release-rss.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ async function readResources(args) {
const measurement = {
kind: "process-max-rss",
scope: args.family === "telegram" ? "release-harness-command" : "qa-command",
command: args.family === "telegram" ? "pnpm rtt" : `pnpm openclaw qa ${args.family}`,
command:
args.family === "telegram"
? "pnpm test:docker:npm-telegram-live"
: `pnpm openclaw qa ${args.family}`,
};
if (args.resourceMetricsPath) {
return aggregateResources([await readResourceMetrics(path.resolve(args.resourceMetricsPath))], measurement);
Expand Down
2 changes: 1 addition & 1 deletion scripts/backfill-release-rss.test.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ test("backfills Telegram RSS without touching RTT p50/p95", async () => {
assert.deepEqual(updated.resources.measurement, {
kind: "process-max-rss",
scope: "release-harness-command",
command: "pnpm rtt",
command: "pnpm test:docker:npm-telegram-live",
});
assert.deepEqual(updated.resources.maxRssKbSamples, [409600]);
assert.equal(updated.resources.maxRssKb.max, 409600);
Expand Down
Loading