From ae14551bea887484207357330dd4604f53e40dd8 Mon Sep 17 00:00:00 2001 From: Dallin Romney Date: Fri, 12 Jun 2026 16:47:03 -0700 Subject: [PATCH 1/6] ci: import telegram qa evidence rtt --- .github/workflows/main-rtt.yml | 37 +++-- .github/workflows/stable-release-rtt.yml | 82 +++++++---- README.md | 2 +- docs/channel-expansion.md | 4 +- docs/data-imports.md | 12 +- scripts/backfill-release-rss.mjs | 5 +- scripts/backfill-release-rss.test.mjs | 2 +- scripts/import-result.mjs | 178 ++++++++++++++++++++++- scripts/import-result.test.mjs | 160 ++++++++++++++++++++ 9 files changed, 423 insertions(+), 59 deletions(-) create mode 100644 scripts/import-result.test.mjs diff --git a/.github/workflows/main-rtt.yml b/.github/workflows/main-rtt.yml index 12c1de01..fd7ea444 100644 --- a/.github/workflows/main-rtt.yml +++ b/.github/workflows/main-rtt.yml @@ -98,7 +98,9 @@ jobs: echo "No package tgz produced." >&2 exit 1 fi + package_version="$(node -e 'const fs = require("node:fs"); process.stdout.write(JSON.parse(fs.readFileSync("package.json", "utf8")).version)')+$(git rev-parse --short=10 HEAD)" echo "package_tgz=${PWD}/${package_tgz}" >>"$GITHUB_OUTPUT" + echo "package_version=$package_version" >>"$GITHUB_OUTPUT" - name: Run RTT id: rtt @@ -115,27 +117,32 @@ jobs: echo "samples must be a positive integer, got: $samples" >&2 exit 1 fi + output="$RUNNER_TEMP/openclaw-rtt-runs/main" metrics_path="$RUNNER_TEMP/openclaw-rtt-resource-metrics.env" + started_at="$(node -e 'process.stdout.write(new Date().toISOString())')" set +e - /usr/bin/time \ + OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="openclaw@main" \ + OPENCLAW_NPM_TELEGRAM_PACKAGE_TGZ="${{ steps.pack.outputs.package_tgz }}" \ + OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ + OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES="$samples" \ + OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ + OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS=30000 \ + /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ - pnpm rtt openclaw@main \ - --package-tgz "${{ steps.pack.outputs.package_tgz }}" \ - --harness-root "$PWD" \ - --output "$RUNNER_TEMP/openclaw-rtt-runs" \ - --samples "$samples" \ - --timeout-ms 240000 \ - --sample-timeout-ms 30000 + pnpm test:docker:npm-telegram-live status="$?" set -e - result_path="$(find "$RUNNER_TEMP/openclaw-rtt-runs" -maxdepth 3 -name result.json -type f | sort | tail -1)" - if [[ -z "$result_path" ]]; then - echo "No RTT result.json produced." >&2 + finished_at="$(node -e 'process.stdout.write(new Date().toISOString())')" + evidence_path="$output/raw/qa-evidence.json" + if [[ ! -f "$evidence_path" ]]; then + echo "No qa-evidence.json produced." >&2 exit "$status" fi - echo "result_path=$result_path" >>"$GITHUB_OUTPUT" + echo "evidence_path=$evidence_path" >>"$GITHUB_OUTPUT" echo "resource_metrics_path=$metrics_path" >>"$GITHUB_OUTPUT" + echo "started_at=$started_at" >>"$GITHUB_OUTPUT" + echo "finished_at=$finished_at" >>"$GITHUB_OUTPUT" - name: Import result working-directory: openclaw-rtt @@ -143,7 +150,11 @@ jobs: run: | set -euo pipefail git pull --rebase origin main - node scripts/import-result.mjs "${{ steps.rtt.outputs.result_path }}" \ + node scripts/import-result.mjs "${{ steps.rtt.outputs.evidence_path }}" \ + --spec openclaw@main \ + --version "${{ steps.pack.outputs.package_version }}" \ + --started-at "${{ steps.rtt.outputs.started_at }}" \ + --finished-at "${{ steps.rtt.outputs.finished_at }}" \ --resource-metrics "${{ steps.rtt.outputs.resource_metrics_path }}" node scripts/validate.mjs node scripts/summary.mjs diff --git a/.github/workflows/stable-release-rtt.yml b/.github/workflows/stable-release-rtt.yml index 11e8929a..6f6aeceb 100644 --- a/.github/workflows/stable-release-rtt.yml +++ b/.github/workflows/stable-release-rtt.yml @@ -131,32 +131,37 @@ jobs: shell: bash run: | set -euo pipefail - for spec in ${{ steps.release.outputs.specs }}; do + read -r -a specs <<< "${{ steps.release.outputs.specs }}" + read -r -a versions <<< "${{ steps.release.outputs.versions }}" + for index in "${!specs[@]}"; do + spec="${specs[$index]}" + version="${versions[$index]}" output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')" metrics_path="${output}/resource-metrics.env" mkdir -p "$output" + started_at="$(node -e 'process.stdout.write(new Date().toISOString())')" set +e - /usr/bin/time \ + OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ + OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ + OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES=20 \ + OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ + OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS=30000 \ + /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ timeout --kill-after=30s 10m \ - pnpm rtt "$spec" \ - --harness-root "$PWD" \ - --output "$output" \ - --samples 20 \ - --timeout-ms 240000 \ - --sample-timeout-ms 30000 + pnpm test:docker:npm-telegram-live status="$?" set -e echo "exit_status=${status}" >>"$metrics_path" - result_path="$(find "$output" -maxdepth 3 -name result.json -type f | sort | tail -1)" - if [[ -z "$result_path" ]]; then - echo "Skipping RSS backfill for $spec because the rerun did not produce a result." >&2 + evidence_path="$output/raw/qa-evidence.json" + if [[ ! -f "$evidence_path" ]]; then + echo "Skipping RSS backfill for $spec because the rerun did not produce qa-evidence.json." >&2 continue fi - run_status="$(node -e 'const fs = require("node:fs"); const result = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); process.stdout.write(String(result.run?.status || ""));' "$result_path")" + run_status="$(node -e 'const fs = require("node:fs"); const evidence = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); const failed = !Array.isArray(evidence.entries) || evidence.entries.some((entry) => entry?.result?.status !== "pass"); process.stdout.write(failed ? "fail" : "pass");' "$evidence_path")" if [[ "$run_status" != "pass" ]]; then - echo "Skipping RSS backfill for failed rerun ${result_path} (status=${run_status:-missing})." >&2 + echo "Skipping RSS backfill for failed rerun ${evidence_path} (status=${run_status:-missing})." >&2 continue fi ( @@ -164,7 +169,8 @@ jobs: git pull --rebase origin "${GITHUB_REF_NAME:-main}" node scripts/backfill-release-rss.mjs \ --family telegram \ - --result "$result_path" \ + --spec "$spec" \ + --version "$version" \ --resource-metrics "$metrics_path" node scripts/validate.mjs git add data/channels/telegram/ runs/telegram/ @@ -195,28 +201,34 @@ jobs: set -euo pipefail result_paths="$RUNNER_TEMP/openclaw-rtt-result-paths.txt" : >"$result_paths" - for spec in ${{ steps.release.outputs.specs }}; do + read -r -a specs <<< "${{ steps.release.outputs.specs }}" + read -r -a versions <<< "${{ steps.release.outputs.versions }}" + for index in "${!specs[@]}"; do + spec="${specs[$index]}" + version="${versions[$index]}" output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')" metrics_path="${output}/resource-metrics.env" mkdir -p "$output" + started_at="$(node -e 'process.stdout.write(new Date().toISOString())')" set +e - /usr/bin/time \ + OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ + OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ + OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES=20 \ + OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ + OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS=30000 \ + /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ - pnpm rtt "$spec" \ - --harness-root "$PWD" \ - --output "$output" \ - --samples 20 \ - --timeout-ms 240000 \ - --sample-timeout-ms 30000 + pnpm test:docker:npm-telegram-live status="$?" set -e + finished_at="$(node -e 'process.stdout.write(new Date().toISOString())')" echo "exit_status=${status}" >>"$metrics_path" - result_path="$(find "$output" -maxdepth 3 -name result.json -type f | sort | tail -1)" - if [[ -z "$result_path" ]]; then - echo "No RTT result.json produced for $spec." >&2 + evidence_path="$output/raw/qa-evidence.json" + if [[ ! -f "$evidence_path" ]]; then + echo "No qa-evidence.json produced for $spec." >&2 if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then - echo "Skipping RSS backfill for $spec because the rerun did not produce a result." >&2 + echo "Skipping RSS backfill for $spec because the rerun did not produce qa-evidence.json." >&2 continue fi if [[ "$status" -eq 0 ]]; then @@ -224,7 +236,7 @@ jobs: fi exit "$status" fi - printf '%s\t%s\n' "$result_path" "$metrics_path" >>"$result_paths" + printf '%s\t%s\t%s\t%s\t%s\t%s\n' "$evidence_path" "$metrics_path" "$spec" "$version" "$started_at" "$finished_at" >>"$result_paths" done echo "result_paths=$result_paths" >>"$GITHUB_OUTPUT" @@ -235,19 +247,25 @@ jobs: run: | set -euo pipefail git pull --rebase origin "${GITHUB_REF_NAME:-main}" - while IFS=$'\t' read -r result_path metrics_path; do + while IFS=$'\t' read -r evidence_path metrics_path spec version started_at finished_at; do if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then - run_status="$(node -e 'const fs = require("node:fs"); const result = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); process.stdout.write(String(result.run?.status || ""));' "$result_path")" + run_status="$(node -e 'const fs = require("node:fs"); const evidence = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); const failed = !Array.isArray(evidence.entries) || evidence.entries.some((entry) => entry?.result?.status !== "pass"); process.stdout.write(failed ? "fail" : "pass");' "$evidence_path")" if [[ "$run_status" != "pass" ]]; then - echo "Skipping RSS backfill for failed rerun ${result_path} (status=${run_status:-missing})." >&2 + echo "Skipping RSS backfill for failed rerun ${evidence_path} (status=${run_status:-missing})." >&2 continue fi node scripts/backfill-release-rss.mjs \ --family telegram \ - --result "$result_path" \ + --spec "$spec" \ + --version "$version" \ --resource-metrics "$metrics_path" else - node scripts/import-result.mjs "$result_path" --resource-metrics "$metrics_path" + node scripts/import-result.mjs "$evidence_path" \ + --spec "$spec" \ + --version "$version" \ + --started-at "$started_at" \ + --finished-at "$finished_at" \ + --resource-metrics "$metrics_path" fi done <"${{ steps.rtt.outputs.result_paths }}" node scripts/validate.mjs diff --git a/README.md b/README.md index cb10fe77..2bff622b 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ Latest imported surface run: `2026-06-12T13:45:18.208Z` ## Telegram Release Runs -Telegram release runs use the OpenClaw repo black-box harness on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target normal-reply samples, a 240s canary timeout, and a 30s per-sample timeout. +Telegram release runs use the OpenClaw package Telegram live QA lane on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target normal-reply samples, a 240s scenario timeout, and a 30s per-sample timeout. New rows import aggregate timing from `qa-evidence.json`; older rows imported by the retired package RTT wrapper keep their historical per-sample arrays. The system under test is the published package running its own Telegram bot. The OpenClaw repo only supplies the mock model server and Telegram driver. `p50` is the median normal-reply RTT. Log notes: [2026-05-02 Testbox stable sweep](logs/2026-05-02-testbox-stable-sweep.md). diff --git a/docs/channel-expansion.md b/docs/channel-expansion.md index f6bf351c..496df682 100644 --- a/docs/channel-expansion.md +++ b/docs/channel-expansion.md @@ -5,7 +5,7 @@ ## Current State - All imported rows use `data/channels//.jsonl` and `runs///result.json`. -- Telegram main/release RTT still uses the older `pnpm rtt` source shape, but it now writes through the shared Telegram channel storage path. +- Telegram main/release RTT uses the OpenClaw package Telegram live lane and imports aggregate timing from `qa-evidence.json` through the shared Telegram channel storage path. - Discord main/release RTT uses the live QA lane with a specialized importer because its summary currently needs observed-message timestamp fallback. - Slack and WhatsApp main RTT use the reusable live-transport importer. - The Discord release resolver backfills missing versions from the Telegram release baseline before measuring future versions. It skips releases that predate or fail the Discord canary contract instead of reporting them as runnable gaps. @@ -37,7 +37,7 @@ Each sample is wrapped with `/usr/bin/time` and imports process max RSS in kilob Discord is intentionally not migrated to the generic live-transport importer yet. Its summary currently omits RTT fields, so the generic importer supports observed-message timestamp fallback and has test coverage for that path, but the existing Discord workflow remains stable while the new channel lane proves itself. -Telegram is listed in the channel config for the future live-transport path, but the current production graph remains on the older `pnpm rtt` package-result path because that is what release sweeps already use. +Telegram is listed in the channel config because the package Telegram live lane now emits QA evidence for `telegram-mentioned-message-reply`. Future Telegram rows keep the same dashboard metrics as older rows, but the source is aggregate `qa-evidence.json` timing rather than the retired package RTT wrapper's per-sample result JSON. Do not read cross-channel values as pure transport rankings. Telegram release rows use `telegram-mentioned-message-reply`; Discord, Slack, and WhatsApp rows use canary scenarios. The live-transport lane also includes QA-lab process overhead in RSS because the measured process is `pnpm openclaw qa `, not only the channel adapter. diff --git a/docs/data-imports.md b/docs/data-imports.md index c8b688f2..78bbde88 100644 --- a/docs/data-imports.md +++ b/docs/data-imports.md @@ -15,16 +15,16 @@ node scripts/import-surface-rtt.mjs rpc-samples.tsv --surface rpc --spec opencla node scripts/import-surface-rtt.mjs samples.tsv --surface control-ui --spec openclaw@main --version node scripts/backfill-rpc-surface-rtt.mjs node scripts/backfill-release-rss.mjs --family discord --spec openclaw@2026.5.16 --version 2026.5.16 --sample-paths samples.tsv +node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live//qa-evidence.json --spec openclaw@beta --version --started-at --finished-at node scripts/summary.mjs ``` -Telegram release imports expect the `result.json` shape emitted by: +Telegram release imports expect the `qa-evidence.json` shape emitted by the OpenClaw package Telegram live lane: ```sh -pnpm rtt openclaw@beta -pnpm rtt openclaw@beta --samples 20 -pnpm rtt openclaw@latest -pnpm rtt openclaw@2026.4.30 --provider live-frontier +OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC=openclaw@beta \ +OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES=20 \ +pnpm test:docker:npm-telegram-live ``` ## Data Layout @@ -36,7 +36,7 @@ pnpm rtt openclaw@2026.4.30 --provider live-frontier Current channel folders are `telegram`, `discord`, `slack`, and `whatsapp`. Telegram and Discord still have specialized importers because their source artifact shapes differ; they now share the same storage contract as generic live-transport channels. -Raw Telegram QA artifacts stay in the OpenClaw repo artifact directory unless explicitly copied here later. +Raw Telegram QA artifacts stay in the OpenClaw repo artifact directory unless explicitly copied here later. Historical Telegram rows imported from the old package RTT wrapper may include `rtt.warmSamples` with every successful sample value. New Telegram imports from `qa-evidence.json` preserve the aggregate dashboard metrics (`canaryMs`, `mentionReplyMs`, `avgMs`, `p50Ms`, `p95Ms`, `maxMs`, `sampleCount`, and `failedSamples`) but do not reconstruct individual sample RTT arrays because the evidence artifact stores aggregate timing. Release RSS backfills only write `resources` onto an existing Telegram or Discord row and its copied `result.json`. The backfill command asserts the stored RTT `p50` and `p95` values are unchanged before it rewrites that version's JSONL file. RSS is process-level data around the sampled command, not isolated channel transport memory. diff --git a/scripts/backfill-release-rss.mjs b/scripts/backfill-release-rss.mjs index 6ba1d62d..7eb990cf 100644 --- a/scripts/backfill-release-rss.mjs +++ b/scripts/backfill-release-rss.mjs @@ -78,7 +78,10 @@ async function readResources(args) { const measurement = { kind: "process-max-rss", scope: args.family === "telegram" ? "release-harness-command" : "qa-command", - command: args.family === "telegram" ? "pnpm rtt" : `pnpm openclaw qa ${args.family}`, + command: + args.family === "telegram" + ? "pnpm test:docker:npm-telegram-live" + : `pnpm openclaw qa ${args.family}`, }; if (args.resourceMetricsPath) { return aggregateResources([await readResourceMetrics(path.resolve(args.resourceMetricsPath))], measurement); diff --git a/scripts/backfill-release-rss.test.mjs b/scripts/backfill-release-rss.test.mjs index 138cc687..dccce50d 100644 --- a/scripts/backfill-release-rss.test.mjs +++ b/scripts/backfill-release-rss.test.mjs @@ -60,7 +60,7 @@ test("backfills Telegram RSS without touching RTT p50/p95", async () => { assert.deepEqual(updated.resources.measurement, { kind: "process-max-rss", scope: "release-harness-command", - command: "pnpm rtt", + command: "pnpm test:docker:npm-telegram-live", }); assert.deepEqual(updated.resources.maxRssKbSamples, [409600]); assert.equal(updated.resources.maxRssKb.max, 409600); diff --git a/scripts/import-result.mjs b/scripts/import-result.mjs index 332b4e89..d3fd2553 100644 --- a/scripts/import-result.mjs +++ b/scripts/import-result.mjs @@ -16,8 +16,9 @@ const TELEGRAM_CHANNEL = { function usage() { return [ - "Usage: node scripts/import-result.mjs ", + "Usage: node scripts/import-result.mjs ", " [--resource-metrics ]", + " [--spec --version --started-at --finished-at ]", ].join("\n"); } @@ -33,6 +34,22 @@ function parseArgs(argv) { args.resourceMetricsPath = argv[(index += 1)]; continue; } + if (arg === "--spec") { + args.spec = argv[(index += 1)]; + continue; + } + if (arg === "--version") { + args.version = argv[(index += 1)]; + continue; + } + if (arg === "--started-at") { + args.startedAt = argv[(index += 1)]; + continue; + } + if (arg === "--finished-at") { + args.finishedAt = argv[(index += 1)]; + continue; + } throw new Error(`Unknown argument: ${arg}\n${usage()}`); } if (!args.sourcePath) { @@ -68,6 +85,20 @@ function validateOptionalNumber(value, label) { } } +function safeRunLabel(input) { + return input.replace(/[^a-zA-Z0-9.-]+/gu, "_").replace(/^_+|_+$/gu, ""); +} + +function buildEvidenceRunId(startedAt, spec) { + return [ + startedAt.replaceAll(":", "").replaceAll(".", ""), + safeRunLabel(spec), + "telegram", + TELEGRAM_CHANNEL.scenario, + "rtt", + ].join("-"); +} + function validateResult(value) { const result = requireObject(value, "result"); const packageInfo = requireObject(result.package, "result.package"); @@ -107,6 +138,135 @@ function validateResult(value) { return result; } +function isQaEvidenceSummary(value) { + return ( + typeof value === "object" && + value !== null && + !Array.isArray(value) && + value.kind === "openclaw.qa.evidence-summary" + ); +} + +function requireEvidenceArgs(args) { + requireString(args.spec, "--spec"); + requireString(args.version, "--version"); + requireString(args.startedAt, "--started-at"); + requireString(args.finishedAt, "--finished-at"); + const startedAtMs = Date.parse(args.startedAt); + const finishedAtMs = Date.parse(args.finishedAt); + if (!Number.isFinite(startedAtMs)) { + throw new Error("--started-at must be a parseable ISO timestamp."); + } + if (!Number.isFinite(finishedAtMs)) { + throw new Error("--finished-at must be a parseable ISO timestamp."); + } + if (finishedAtMs < startedAtMs) { + throw new Error("--finished-at must be at or after --started-at."); + } + return { finishedAtMs, startedAtMs }; +} + +function evidenceEntry(evidence, testId) { + const entries = Array.isArray(evidence.entries) ? evidence.entries : []; + const entry = entries.find((candidate) => candidate?.test?.id === testId); + if (!entry) { + const available = entries.map((candidate) => candidate?.test?.id).filter(Boolean).join(", "); + throw new Error(`qa evidence missing ${testId}; available: ${available || ""}`); + } + return entry; +} + +function readTiming(entry, label) { + const timing = entry?.result?.timing; + if (!timing || typeof timing !== "object" || Array.isArray(timing)) { + throw new Error(`${label} is missing result.timing.`); + } + return timing; +} + +function finiteTimingNumber(timing, name) { + const value = timing[name]; + return typeof value === "number" && Number.isFinite(value) ? value : undefined; +} + +function providerModeFromEvidence(entry) { + const provider = entry?.execution?.provider; + if (!provider || typeof provider !== "object" || Array.isArray(provider)) { + return "unknown"; + } + if (typeof provider.auth === "string" && provider.auth.trim()) { + return provider.auth; + } + if (typeof provider.fixture === "string" && provider.fixture.trim()) { + return provider.fixture; + } + return provider.live === true ? "live-frontier" : "mock-openai"; +} + +function statusFromEvidence(entries) { + return entries.every((entry) => entry?.result?.status === "pass") ? "pass" : "fail"; +} + +function buildResultFromEvidence(evidence, args) { + const { finishedAtMs, startedAtMs } = requireEvidenceArgs(args); + const canary = evidenceEntry(evidence, "telegram-canary"); + const mention = evidenceEntry(evidence, TELEGRAM_CHANNEL.scenario); + const canaryTiming = readTiming(canary, "telegram-canary"); + const mentionTiming = readTiming(mention, TELEGRAM_CHANNEL.scenario); + const canaryMs = finiteTimingNumber(canaryTiming, "rttMs"); + const mentionReplyMs = + finiteTimingNumber(mentionTiming, "p50Ms") ?? finiteTimingNumber(mentionTiming, "rttMs"); + const sampleCount = finiteTimingNumber(mentionTiming, "samples"); + const failedSamples = finiteTimingNumber(mentionTiming, "failedSamples"); + return { + package: { + spec: args.spec, + version: args.version, + }, + run: { + id: buildEvidenceRunId(args.startedAt, args.spec), + startedAt: args.startedAt, + finishedAt: args.finishedAt, + durationMs: finishedAtMs - startedAtMs, + status: + statusFromEvidence([canary, mention]) === "pass" && + typeof canaryMs === "number" && + typeof mentionReplyMs === "number" + ? "pass" + : "fail", + }, + mode: { + providerMode: providerModeFromEvidence(mention), + scenarios: [TELEGRAM_CHANNEL.scenario], + source: "qa-evidence", + }, + rtt: { + canaryMs, + mentionReplyMs, + avgMs: finiteTimingNumber(mentionTiming, "avgMs"), + p50Ms: finiteTimingNumber(mentionTiming, "p50Ms") ?? mentionReplyMs, + p95Ms: finiteTimingNumber(mentionTiming, "p95Ms"), + maxMs: finiteTimingNumber(mentionTiming, "maxMs"), + failedSamples, + ...(typeof sampleCount === "number" ? { sampleCount } : {}), + sources: ["qa-evidence"], + }, + samples: + typeof sampleCount === "number" + ? [ + { + index: 1, + status: mention?.result?.status === "pass" ? "pass" : "fail", + details: `aggregate timing from qa-evidence.json (${Math.max( + 0, + sampleCount - (failedSamples ?? 0), + )}/${sampleCount} samples passed)`, + }, + ] + : undefined, + }; +} + async function readJson(pathname) { return JSON.parse(await fs.readFile(pathname, "utf8")); } @@ -116,17 +276,29 @@ async function existingRunIds() { return new Set([...ids].filter(Boolean)); } +function resourceMeasurementForResult(result) { + return { + kind: "process-max-rss", + scope: "release-harness-command", + command: + result.mode?.source === "qa-evidence" ? "pnpm test:docker:npm-telegram-live" : "pnpm rtt", + }; +} + async function main() { const args = parseArgs(process.argv.slice(2)); - const result = validateResult(await readJson(path.resolve(args.sourcePath))); + const input = await readJson(path.resolve(args.sourcePath)); + const result = isQaEvidenceSummary(input) + ? buildResultFromEvidence(input, args) + : validateResult(input); const seen = await existingRunIds(); if (seen.has(result.run.id)) { throw new Error(`Run already imported: ${result.run.id}`); } if (args.resourceMetricsPath) { const resourceMetrics = await readResourceMetrics(path.resolve(args.resourceMetricsPath)); - result.resources = aggregateResources([resourceMetrics]); + result.resources = aggregateResources([resourceMetrics], resourceMeasurementForResult(result)); } result.channel = { ...TELEGRAM_CHANNEL, diff --git a/scripts/import-result.test.mjs b/scripts/import-result.test.mjs new file mode 100644 index 00000000..0ed2c4f8 --- /dev/null +++ b/scripts/import-result.test.mjs @@ -0,0 +1,160 @@ +import assert from "node:assert/strict"; +import { execFile } from "node:child_process"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import { promisify } from "node:util"; +import test from "node:test"; + +const execFileAsync = promisify(execFile); +const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), ".."); +const IMPORT_SCRIPT = path.join(REPO_ROOT, "scripts/import-result.mjs"); + +async function makeWorkspace() { + return await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-rtt-import-result-test-")); +} + +async function writeJson(pathname, value) { + await fs.mkdir(path.dirname(pathname), { recursive: true }); + await fs.writeFile(pathname, `${JSON.stringify(value, null, 2)}\n`); +} + +async function readJsonl(pathname) { + const text = await fs.readFile(pathname, "utf8"); + return text + .split("\n") + .filter(Boolean) + .map((line) => JSON.parse(line)); +} + +test("imports Telegram qa-evidence as the existing RTT row shape", async () => { + const workspace = await makeWorkspace(); + const evidencePath = path.join(workspace, "qa-evidence.json"); + const metricsPath = path.join(workspace, "resource-metrics.env"); + await writeJson(evidencePath, { + kind: "openclaw.qa.evidence-summary", + schemaVersion: 2, + generatedAt: "2026-06-12T20:00:20.000Z", + entries: [ + { + test: { + kind: "live-transport-check", + id: "telegram-canary", + title: "Telegram canary", + }, + execution: { + provider: { + id: "openai", + live: false, + fixture: "mock-openai", + }, + }, + result: { + status: "pass", + timing: { + rttMs: 900, + }, + }, + }, + { + test: { + kind: "live-transport-check", + id: "telegram-mentioned-message-reply", + title: "Telegram mentioned message gets a reply", + }, + execution: { + provider: { + id: "openai", + live: false, + fixture: "mock-openai", + }, + }, + result: { + status: "pass", + timing: { + rttMs: 1200, + avgMs: 1300, + p50Ms: 1200, + p95Ms: 1800, + maxMs: 2200, + samples: 5, + failedSamples: 1, + }, + }, + }, + ], + }); + await fs.writeFile(metricsPath, "max_rss_kb=204800\nelapsed_seconds=22.5\n"); + + await execFileAsync( + process.execPath, + [ + IMPORT_SCRIPT, + evidencePath, + "--spec", + "openclaw@main", + "--version", + "2026.6.2+abcdef1234", + "--started-at", + "2026-06-12T20:00:00.000Z", + "--finished-at", + "2026-06-12T20:00:30.000Z", + "--resource-metrics", + metricsPath, + ], + { cwd: workspace }, + ); + + const [row] = await readJsonl( + path.join(workspace, "data/channels/telegram/2026.6.2+abcdef1234.jsonl"), + ); + assert.deepEqual(row.channel, { + id: "telegram", + label: "Telegram", + scenario: "telegram-mentioned-message-reply", + }); + assert.equal(row.package.spec, "openclaw@main"); + assert.equal(row.package.version, "2026.6.2+abcdef1234"); + assert.equal(row.run.status, "pass"); + assert.equal(row.run.durationMs, 30_000); + assert.equal(row.mode.providerMode, "mock-openai"); + assert.equal(row.mode.source, "qa-evidence"); + assert.equal(row.rtt.canaryMs, 900); + assert.equal(row.rtt.mentionReplyMs, 1200); + assert.equal(row.rtt.avgMs, 1300); + assert.equal(row.rtt.p50Ms, 1200); + assert.equal(row.rtt.p95Ms, 1800); + assert.equal(row.rtt.maxMs, 2200); + assert.equal(row.rtt.sampleCount, 5); + assert.equal(row.rtt.failedSamples, 1); + assert.deepEqual(row.rtt.sources, ["qa-evidence"]); + assert.deepEqual(row.resources.measurement, { + kind: "process-max-rss", + scope: "release-harness-command", + command: "pnpm test:docker:npm-telegram-live", + }); + assert.deepEqual(row.resources.maxRssKbSamples, [204800]); + assert.equal(row.resources.maxRssKb.p50, 204800); + assert.deepEqual(row.resources.elapsedSecondsSamples, [22.5]); + assert.deepEqual(row.artifacts, { + resultPath: row.artifacts.resultPath, + }); + assert.match(row.artifacts.resultPath, /^runs\/telegram\/.+\/result\.json$/u); +}); + +test("requires package metadata for qa-evidence imports", async () => { + const workspace = await makeWorkspace(); + const evidencePath = path.join(workspace, "qa-evidence.json"); + await writeJson(evidencePath, { + kind: "openclaw.qa.evidence-summary", + schemaVersion: 2, + generatedAt: "2026-06-12T20:00:20.000Z", + entries: [], + }); + + await assert.rejects( + execFileAsync(process.execPath, [IMPORT_SCRIPT, evidencePath], { cwd: workspace }), + /--spec must be a non-empty string/u, + ); +}); From f256a8fde1f6afa067a2eafc099f5c296ec49470 Mon Sep 17 00:00:00 2001 From: Dallin Romney Date: Fri, 12 Jun 2026 16:52:03 -0700 Subject: [PATCH 2/6] refactor: require qa evidence telegram imports --- docs/data-imports.md | 5 ++- scripts/import-result.mjs | 68 ++++----------------------------------- 2 files changed, 9 insertions(+), 64 deletions(-) diff --git a/docs/data-imports.md b/docs/data-imports.md index 78bbde88..8efa965f 100644 --- a/docs/data-imports.md +++ b/docs/data-imports.md @@ -7,15 +7,14 @@ Run importers from the repo root: ```sh -node scripts/import-result.mjs ../clawdbot/runs//result.json -node scripts/import-result.mjs ../clawdbot/runs//result.json --resource-metrics resource-metrics.env +node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live//qa-evidence.json --spec openclaw@beta --version --started-at --finished-at +node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live//qa-evidence.json --spec openclaw@beta --version --started-at --finished-at --resource-metrics resource-metrics.env node scripts/import-discord-rtt.mjs samples.tsv --spec openclaw@main --version (cd ../openclaw && node --import tsx ../openclaw-rtt/scripts/measure-rpc-rtt.mjs --output-dir ../openclaw-rtt/.artifacts/rpc-rtt/sample-1) node scripts/import-surface-rtt.mjs rpc-samples.tsv --surface rpc --spec openclaw@main --version --provider-mode gateway-rpc --scenario rpc-gateway-smoke --require-pass node scripts/import-surface-rtt.mjs samples.tsv --surface control-ui --spec openclaw@main --version node scripts/backfill-rpc-surface-rtt.mjs node scripts/backfill-release-rss.mjs --family discord --spec openclaw@2026.5.16 --version 2026.5.16 --sample-paths samples.tsv -node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live//qa-evidence.json --spec openclaw@beta --version --started-at --finished-at node scripts/summary.mjs ``` diff --git a/scripts/import-result.mjs b/scripts/import-result.mjs index d3fd2553..fc33aa0d 100644 --- a/scripts/import-result.mjs +++ b/scripts/import-result.mjs @@ -16,9 +16,9 @@ const TELEGRAM_CHANNEL = { function usage() { return [ - "Usage: node scripts/import-result.mjs ", + "Usage: node scripts/import-result.mjs ", " [--resource-metrics ]", - " [--spec --version --started-at --finished-at ]", + " --spec --version --started-at --finished-at ", ].join("\n"); } @@ -79,12 +79,6 @@ function requireNumber(value, label) { return value; } -function validateOptionalNumber(value, label) { - if (value !== undefined) { - requireNumber(value, label); - } -} - function safeRunLabel(input) { return input.replace(/[^a-zA-Z0-9.-]+/gu, "_").replace(/^_+|_+$/gu, ""); } @@ -99,54 +93,6 @@ function buildEvidenceRunId(startedAt, spec) { ].join("-"); } -function validateResult(value) { - const result = requireObject(value, "result"); - const packageInfo = requireObject(result.package, "result.package"); - const run = requireObject(result.run, "result.run"); - const mode = requireObject(result.mode, "result.mode"); - const rtt = requireObject(result.rtt, "result.rtt"); - - requireString(packageInfo.spec, "result.package.spec"); - requireString(packageInfo.version, "result.package.version"); - requireString(run.id, "result.run.id"); - requireString(run.startedAt, "result.run.startedAt"); - requireString(run.finishedAt, "result.run.finishedAt"); - requireNumber(run.durationMs, "result.run.durationMs"); - if (run.status !== "pass" && run.status !== "fail") { - throw new Error("result.run.status must be pass or fail."); - } - requireString(mode.providerMode, "result.mode.providerMode"); - if (!Array.isArray(mode.scenarios)) { - throw new Error("result.mode.scenarios must be an array."); - } - validateOptionalNumber(rtt.canaryMs, "result.rtt.canaryMs"); - validateOptionalNumber(rtt.mentionReplyMs, "result.rtt.mentionReplyMs"); - validateOptionalNumber(rtt.avgMs, "result.rtt.avgMs"); - validateOptionalNumber(rtt.p50Ms, "result.rtt.p50Ms"); - validateOptionalNumber(rtt.p95Ms, "result.rtt.p95Ms"); - validateOptionalNumber(rtt.maxMs, "result.rtt.maxMs"); - validateOptionalNumber(rtt.failedSamples, "result.rtt.failedSamples"); - if (rtt.warmSamples !== undefined) { - if (!Array.isArray(rtt.warmSamples)) { - throw new Error("result.rtt.warmSamples must be an array."); - } - rtt.warmSamples.forEach((sample, index) => { - requireNumber(sample, `result.rtt.warmSamples[${index}]`); - }); - } - - return result; -} - -function isQaEvidenceSummary(value) { - return ( - typeof value === "object" && - value !== null && - !Array.isArray(value) && - value.kind === "openclaw.qa.evidence-summary" - ); -} - function requireEvidenceArgs(args) { requireString(args.spec, "--spec"); requireString(args.version, "--version"); @@ -208,6 +154,9 @@ function statusFromEvidence(entries) { } function buildResultFromEvidence(evidence, args) { + if (evidence.kind !== "openclaw.qa.evidence-summary") { + throw new Error("input must be an OpenClaw qa-evidence.json summary."); + } const { finishedAtMs, startedAtMs } = requireEvidenceArgs(args); const canary = evidenceEntry(evidence, "telegram-canary"); const mention = evidenceEntry(evidence, TELEGRAM_CHANNEL.scenario); @@ -280,8 +229,7 @@ function resourceMeasurementForResult(result) { return { kind: "process-max-rss", scope: "release-harness-command", - command: - result.mode?.source === "qa-evidence" ? "pnpm test:docker:npm-telegram-live" : "pnpm rtt", + command: "pnpm test:docker:npm-telegram-live", }; } @@ -289,9 +237,7 @@ async function main() { const args = parseArgs(process.argv.slice(2)); const input = await readJson(path.resolve(args.sourcePath)); - const result = isQaEvidenceSummary(input) - ? buildResultFromEvidence(input, args) - : validateResult(input); + const result = buildResultFromEvidence(requireObject(input, "input"), args); const seen = await existingRunIds(); if (seen.has(result.run.id)) { throw new Error(`Run already imported: ${result.run.id}`); From bfea58b8e7efaaae0996518393ef328059747887 Mon Sep 17 00:00:00 2001 From: Dallin Romney Date: Fri, 12 Jun 2026 17:18:24 -0700 Subject: [PATCH 3/6] ci: fix rtt workflow lint --- .github/workflows/main-rtt.yml | 10 ++++++---- .github/workflows/stable-release-rtt.yml | 1 - 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main-rtt.yml b/.github/workflows/main-rtt.yml index fd7ea444..a547a354 100644 --- a/.github/workflows/main-rtt.yml +++ b/.github/workflows/main-rtt.yml @@ -139,10 +139,12 @@ jobs: echo "No qa-evidence.json produced." >&2 exit "$status" fi - echo "evidence_path=$evidence_path" >>"$GITHUB_OUTPUT" - echo "resource_metrics_path=$metrics_path" >>"$GITHUB_OUTPUT" - echo "started_at=$started_at" >>"$GITHUB_OUTPUT" - echo "finished_at=$finished_at" >>"$GITHUB_OUTPUT" + { + echo "evidence_path=$evidence_path" + echo "resource_metrics_path=$metrics_path" + echo "started_at=$started_at" + echo "finished_at=$finished_at" + } >>"$GITHUB_OUTPUT" - name: Import result working-directory: openclaw-rtt diff --git a/.github/workflows/stable-release-rtt.yml b/.github/workflows/stable-release-rtt.yml index 6f6aeceb..61313401 100644 --- a/.github/workflows/stable-release-rtt.yml +++ b/.github/workflows/stable-release-rtt.yml @@ -139,7 +139,6 @@ jobs: output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')" metrics_path="${output}/resource-metrics.env" mkdir -p "$output" - started_at="$(node -e 'process.stdout.write(new Date().toISOString())')" set +e OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ From c1a74ca2f7a1d1997fb4aa5ab576e8e72fe1b5a5 Mon Sep 17 00:00:00 2001 From: Dallin Romney Date: Fri, 12 Jun 2026 19:14:23 -0700 Subject: [PATCH 4/6] Align Telegram RTT workflow env names --- .github/workflows/main-rtt.yml | 4 ++-- .github/workflows/stable-release-rtt.yml | 8 ++++---- README.md | 2 +- docs/data-imports.md | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/main-rtt.yml b/.github/workflows/main-rtt.yml index a547a354..4e91d7e1 100644 --- a/.github/workflows/main-rtt.yml +++ b/.github/workflows/main-rtt.yml @@ -124,9 +124,9 @@ jobs: OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="openclaw@main" \ OPENCLAW_NPM_TELEGRAM_PACKAGE_TGZ="${{ steps.pack.outputs.package_tgz }}" \ OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ - OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES="$samples" \ + OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES="$samples" \ OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ - OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS=30000 \ + OPENCLAW_NPM_TELEGRAM_RTT_TIMEOUT_MS=30000 \ /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ diff --git a/.github/workflows/stable-release-rtt.yml b/.github/workflows/stable-release-rtt.yml index 61313401..6a2fcfcb 100644 --- a/.github/workflows/stable-release-rtt.yml +++ b/.github/workflows/stable-release-rtt.yml @@ -142,9 +142,9 @@ jobs: set +e OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ - OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES=20 \ + OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES=20 \ OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ - OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS=30000 \ + OPENCLAW_NPM_TELEGRAM_RTT_TIMEOUT_MS=30000 \ /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ @@ -212,9 +212,9 @@ jobs: set +e OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ - OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES=20 \ + OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES=20 \ OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ - OPENCLAW_NPM_TELEGRAM_SAMPLE_TIMEOUT_MS=30000 \ + OPENCLAW_NPM_TELEGRAM_RTT_TIMEOUT_MS=30000 \ /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ diff --git a/README.md b/README.md index 2bff622b..7a0a86a7 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ Latest imported surface run: `2026-06-12T13:45:18.208Z` ## Telegram Release Runs -Telegram release runs use the OpenClaw package Telegram live QA lane on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target normal-reply samples, a 240s scenario timeout, and a 30s per-sample timeout. New rows import aggregate timing from `qa-evidence.json`; older rows imported by the retired package RTT wrapper keep their historical per-sample arrays. +Telegram release runs use the OpenClaw package Telegram live QA lane on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target RTT checks, a 240s scenario timeout, and a 30s per-check timeout. New rows import aggregate timing from `qa-evidence.json`; older rows imported by the retired package RTT wrapper keep their historical per-sample arrays. The system under test is the published package running its own Telegram bot. The OpenClaw repo only supplies the mock model server and Telegram driver. `p50` is the median normal-reply RTT. Log notes: [2026-05-02 Testbox stable sweep](logs/2026-05-02-testbox-stable-sweep.md). diff --git a/docs/data-imports.md b/docs/data-imports.md index 8efa965f..b319b60d 100644 --- a/docs/data-imports.md +++ b/docs/data-imports.md @@ -22,7 +22,7 @@ Telegram release imports expect the `qa-evidence.json` shape emitted by the Open ```sh OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC=openclaw@beta \ -OPENCLAW_NPM_TELEGRAM_WARM_SAMPLES=20 \ +OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES=20 \ pnpm test:docker:npm-telegram-live ``` From 9068cb311ba1328f06b6183f3d9a470dcaf37cb0 Mon Sep 17 00:00:00 2001 From: Dallin Romney Date: Sun, 14 Jun 2026 00:27:47 -0700 Subject: [PATCH 5/6] Guard Telegram imports on aggregate QA evidence --- docs/data-imports.md | 2 +- scripts/import-result.mjs | 44 +++++++++++++++----------- scripts/import-result.test.mjs | 58 ++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 19 deletions(-) diff --git a/docs/data-imports.md b/docs/data-imports.md index b319b60d..f7c7af76 100644 --- a/docs/data-imports.md +++ b/docs/data-imports.md @@ -18,7 +18,7 @@ node scripts/backfill-release-rss.mjs --family discord --spec openclaw@2026.5.16 node scripts/summary.mjs ``` -Telegram release imports expect the `qa-evidence.json` shape emitted by the OpenClaw package Telegram live lane: +Telegram release imports expect the aggregate `qa-evidence.json` shape emitted by the OpenClaw package Telegram live lane. The OpenClaw harness checkout must include the package Telegram RTT evidence path; older OpenClaw packages can still be the system under test. ```sh OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC=openclaw@beta \ diff --git a/scripts/import-result.mjs b/scripts/import-result.mjs index fc33aa0d..72ca0661 100644 --- a/scripts/import-result.mjs +++ b/scripts/import-result.mjs @@ -135,6 +135,14 @@ function finiteTimingNumber(timing, name) { return typeof value === "number" && Number.isFinite(value) ? value : undefined; } +function requirePositiveTimingNumber(timing, name, label) { + const value = finiteTimingNumber(timing, name); + if (value === undefined || value <= 0) { + throw new Error(`${label} must include positive result.timing.${name}.`); + } + return value; +} + function providerModeFromEvidence(entry) { const provider = entry?.execution?.provider; if (!provider || typeof provider !== "object" || Array.isArray(provider)) { @@ -163,9 +171,12 @@ function buildResultFromEvidence(evidence, args) { const canaryTiming = readTiming(canary, "telegram-canary"); const mentionTiming = readTiming(mention, TELEGRAM_CHANNEL.scenario); const canaryMs = finiteTimingNumber(canaryTiming, "rttMs"); - const mentionReplyMs = - finiteTimingNumber(mentionTiming, "p50Ms") ?? finiteTimingNumber(mentionTiming, "rttMs"); - const sampleCount = finiteTimingNumber(mentionTiming, "samples"); + const sampleCount = requirePositiveTimingNumber( + mentionTiming, + "samples", + TELEGRAM_CHANNEL.scenario, + ); + const mentionReplyMs = finiteTimingNumber(mentionTiming, "p50Ms"); const failedSamples = finiteTimingNumber(mentionTiming, "failedSamples"); return { package: { @@ -193,26 +204,23 @@ function buildResultFromEvidence(evidence, args) { canaryMs, mentionReplyMs, avgMs: finiteTimingNumber(mentionTiming, "avgMs"), - p50Ms: finiteTimingNumber(mentionTiming, "p50Ms") ?? mentionReplyMs, + p50Ms: mentionReplyMs, p95Ms: finiteTimingNumber(mentionTiming, "p95Ms"), maxMs: finiteTimingNumber(mentionTiming, "maxMs"), failedSamples, - ...(typeof sampleCount === "number" ? { sampleCount } : {}), + sampleCount, sources: ["qa-evidence"], }, - samples: - typeof sampleCount === "number" - ? [ - { - index: 1, - status: mention?.result?.status === "pass" ? "pass" : "fail", - details: `aggregate timing from qa-evidence.json (${Math.max( - 0, - sampleCount - (failedSamples ?? 0), - )}/${sampleCount} samples passed)`, - }, - ] - : undefined, + samples: [ + { + index: 1, + status: mention?.result?.status === "pass" ? "pass" : "fail", + details: `aggregate timing from qa-evidence.json (${Math.max( + 0, + sampleCount - (failedSamples ?? 0), + )}/${sampleCount} samples passed)`, + }, + ], }; } diff --git a/scripts/import-result.test.mjs b/scripts/import-result.test.mjs index 0ed2c4f8..a83ee89d 100644 --- a/scripts/import-result.test.mjs +++ b/scripts/import-result.test.mjs @@ -158,3 +158,61 @@ test("requires package metadata for qa-evidence imports", async () => { /--spec must be a non-empty string/u, ); }); + +test("rejects qa-evidence without aggregate Telegram RTT samples", async () => { + const workspace = await makeWorkspace(); + const evidencePath = path.join(workspace, "qa-evidence.json"); + await writeJson(evidencePath, { + kind: "openclaw.qa.evidence-summary", + schemaVersion: 2, + generatedAt: "2026-06-12T20:00:20.000Z", + entries: [ + { + test: { + kind: "live-transport-check", + id: "telegram-canary", + title: "Telegram canary", + }, + result: { + status: "pass", + timing: { + rttMs: 900, + }, + }, + }, + { + test: { + kind: "live-transport-check", + id: "telegram-mentioned-message-reply", + title: "Telegram mentioned message gets a reply", + }, + result: { + status: "pass", + timing: { + rttMs: 1200, + }, + }, + }, + ], + }); + + await assert.rejects( + execFileAsync( + process.execPath, + [ + IMPORT_SCRIPT, + evidencePath, + "--spec", + "openclaw@main", + "--version", + "2026.6.2+abcdef1234", + "--started-at", + "2026-06-12T20:00:00.000Z", + "--finished-at", + "2026-06-12T20:00:30.000Z", + ], + { cwd: workspace }, + ), + /telegram-mentioned-message-reply must include positive result\.timing\.samples/u, + ); +}); From 16f4ee71a340591a3c383f50415346d79aa67f5d Mon Sep 17 00:00:00 2001 From: Dallin Romney Date: Sun, 14 Jun 2026 01:42:49 -0700 Subject: [PATCH 6/6] refactor: simplify telegram evidence import wiring --- .github/workflows/main-rtt.yml | 18 ++++++++----- .github/workflows/stable-release-rtt.yml | 32 +++++----------------- docs/data-imports.md | 5 ++-- scripts/import-result.mjs | 17 ++++++------ scripts/import-result.test.mjs | 34 ++++++++++-------------- 5 files changed, 43 insertions(+), 63 deletions(-) diff --git a/.github/workflows/main-rtt.yml b/.github/workflows/main-rtt.yml index 4e91d7e1..03ee17af 100644 --- a/.github/workflows/main-rtt.yml +++ b/.github/workflows/main-rtt.yml @@ -118,15 +118,20 @@ jobs: exit 1 fi output="$RUNNER_TEMP/openclaw-rtt-runs/main" + rtt_env=( + OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="openclaw@main" + OPENCLAW_NPM_TELEGRAM_PACKAGE_TGZ="${{ steps.pack.outputs.package_tgz }}" + OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" + OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 + ) + if [[ "$samples" != "20" ]]; then + rtt_env+=(OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES="$samples") + fi metrics_path="$RUNNER_TEMP/openclaw-rtt-resource-metrics.env" started_at="$(node -e 'process.stdout.write(new Date().toISOString())')" set +e - OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="openclaw@main" \ - OPENCLAW_NPM_TELEGRAM_PACKAGE_TGZ="${{ steps.pack.outputs.package_tgz }}" \ - OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ - OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES="$samples" \ - OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ - OPENCLAW_NPM_TELEGRAM_RTT_TIMEOUT_MS=30000 \ + env \ + "${rtt_env[@]}" \ /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ @@ -153,7 +158,6 @@ jobs: set -euo pipefail git pull --rebase origin main node scripts/import-result.mjs "${{ steps.rtt.outputs.evidence_path }}" \ - --spec openclaw@main \ --version "${{ steps.pack.outputs.package_version }}" \ --started-at "${{ steps.rtt.outputs.started_at }}" \ --finished-at "${{ steps.rtt.outputs.finished_at }}" \ diff --git a/.github/workflows/stable-release-rtt.yml b/.github/workflows/stable-release-rtt.yml index 6a2fcfcb..40e7ea2e 100644 --- a/.github/workflows/stable-release-rtt.yml +++ b/.github/workflows/stable-release-rtt.yml @@ -142,9 +142,7 @@ jobs: set +e OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ - OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES=20 \ OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ - OPENCLAW_NPM_TELEGRAM_RTT_TIMEOUT_MS=30000 \ /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ @@ -212,9 +210,7 @@ jobs: set +e OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ - OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES=20 \ OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ - OPENCLAW_NPM_TELEGRAM_RTT_TIMEOUT_MS=30000 \ /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ @@ -235,7 +231,7 @@ jobs: fi exit "$status" fi - printf '%s\t%s\t%s\t%s\t%s\t%s\n' "$evidence_path" "$metrics_path" "$spec" "$version" "$started_at" "$finished_at" >>"$result_paths" + printf '%s\t%s\t%s\t%s\t%s\n' "$evidence_path" "$metrics_path" "$version" "$started_at" "$finished_at" >>"$result_paths" done echo "result_paths=$result_paths" >>"$GITHUB_OUTPUT" @@ -246,26 +242,12 @@ jobs: run: | set -euo pipefail git pull --rebase origin "${GITHUB_REF_NAME:-main}" - while IFS=$'\t' read -r evidence_path metrics_path spec version started_at finished_at; do - if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then - run_status="$(node -e 'const fs = require("node:fs"); const evidence = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); const failed = !Array.isArray(evidence.entries) || evidence.entries.some((entry) => entry?.result?.status !== "pass"); process.stdout.write(failed ? "fail" : "pass");' "$evidence_path")" - if [[ "$run_status" != "pass" ]]; then - echo "Skipping RSS backfill for failed rerun ${evidence_path} (status=${run_status:-missing})." >&2 - continue - fi - node scripts/backfill-release-rss.mjs \ - --family telegram \ - --spec "$spec" \ - --version "$version" \ - --resource-metrics "$metrics_path" - else - node scripts/import-result.mjs "$evidence_path" \ - --spec "$spec" \ - --version "$version" \ - --started-at "$started_at" \ - --finished-at "$finished_at" \ - --resource-metrics "$metrics_path" - fi + while IFS=$'\t' read -r evidence_path metrics_path version started_at finished_at; do + node scripts/import-result.mjs "$evidence_path" \ + --version "$version" \ + --started-at "$started_at" \ + --finished-at "$finished_at" \ + --resource-metrics "$metrics_path" done <"${{ steps.rtt.outputs.result_paths }}" node scripts/validate.mjs node scripts/summary.mjs diff --git a/docs/data-imports.md b/docs/data-imports.md index f7c7af76..ad3d75a3 100644 --- a/docs/data-imports.md +++ b/docs/data-imports.md @@ -7,8 +7,8 @@ Run importers from the repo root: ```sh -node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live//qa-evidence.json --spec openclaw@beta --version --started-at --finished-at -node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live//qa-evidence.json --spec openclaw@beta --version --started-at --finished-at --resource-metrics resource-metrics.env +node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live//qa-evidence.json --version --started-at --finished-at +node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live//qa-evidence.json --version --started-at --finished-at --resource-metrics resource-metrics.env node scripts/import-discord-rtt.mjs samples.tsv --spec openclaw@main --version (cd ../openclaw && node --import tsx ../openclaw-rtt/scripts/measure-rpc-rtt.mjs --output-dir ../openclaw-rtt/.artifacts/rpc-rtt/sample-1) node scripts/import-surface-rtt.mjs rpc-samples.tsv --surface rpc --spec openclaw@main --version --provider-mode gateway-rpc --scenario rpc-gateway-smoke --require-pass @@ -22,7 +22,6 @@ Telegram release imports expect the aggregate `qa-evidence.json` shape emitted b ```sh OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC=openclaw@beta \ -OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES=20 \ pnpm test:docker:npm-telegram-live ``` diff --git a/scripts/import-result.mjs b/scripts/import-result.mjs index 72ca0661..8acc6b0f 100644 --- a/scripts/import-result.mjs +++ b/scripts/import-result.mjs @@ -18,7 +18,7 @@ function usage() { return [ "Usage: node scripts/import-result.mjs ", " [--resource-metrics ]", - " --spec --version --started-at --finished-at ", + " --version --started-at --finished-at ", ].join("\n"); } @@ -34,10 +34,6 @@ function parseArgs(argv) { args.resourceMetricsPath = argv[(index += 1)]; continue; } - if (arg === "--spec") { - args.spec = argv[(index += 1)]; - continue; - } if (arg === "--version") { args.version = argv[(index += 1)]; continue; @@ -94,7 +90,6 @@ function buildEvidenceRunId(startedAt, spec) { } function requireEvidenceArgs(args) { - requireString(args.spec, "--spec"); requireString(args.version, "--version"); requireString(args.startedAt, "--started-at"); requireString(args.finishedAt, "--finished-at"); @@ -122,6 +117,11 @@ function evidenceEntry(evidence, testId) { return entry; } +function packageSpecFromEvidence(entry) { + const spec = entry?.execution?.packageSource?.spec; + return requireString(spec, "qa evidence execution.packageSource.spec"); +} + function readTiming(entry, label) { const timing = entry?.result?.timing; if (!timing || typeof timing !== "object" || Array.isArray(timing)) { @@ -168,6 +168,7 @@ function buildResultFromEvidence(evidence, args) { const { finishedAtMs, startedAtMs } = requireEvidenceArgs(args); const canary = evidenceEntry(evidence, "telegram-canary"); const mention = evidenceEntry(evidence, TELEGRAM_CHANNEL.scenario); + const packageSpec = packageSpecFromEvidence(mention); const canaryTiming = readTiming(canary, "telegram-canary"); const mentionTiming = readTiming(mention, TELEGRAM_CHANNEL.scenario); const canaryMs = finiteTimingNumber(canaryTiming, "rttMs"); @@ -180,11 +181,11 @@ function buildResultFromEvidence(evidence, args) { const failedSamples = finiteTimingNumber(mentionTiming, "failedSamples"); return { package: { - spec: args.spec, + spec: packageSpec, version: args.version, }, run: { - id: buildEvidenceRunId(args.startedAt, args.spec), + id: buildEvidenceRunId(args.startedAt, packageSpec), startedAt: args.startedAt, finishedAt: args.finishedAt, durationMs: finishedAtMs - startedAtMs, diff --git a/scripts/import-result.test.mjs b/scripts/import-result.test.mjs index a83ee89d..1d6c7f5b 100644 --- a/scripts/import-result.test.mjs +++ b/scripts/import-result.test.mjs @@ -44,6 +44,10 @@ test("imports Telegram qa-evidence as the existing RTT row shape", async () => { title: "Telegram canary", }, execution: { + packageSource: { + kind: "npm-package", + spec: "openclaw@main", + }, provider: { id: "openai", live: false, @@ -64,6 +68,10 @@ test("imports Telegram qa-evidence as the existing RTT row shape", async () => { title: "Telegram mentioned message gets a reply", }, execution: { + packageSource: { + kind: "npm-package", + spec: "openclaw@main", + }, provider: { id: "openai", live: false, @@ -92,8 +100,6 @@ test("imports Telegram qa-evidence as the existing RTT row shape", async () => { [ IMPORT_SCRIPT, evidencePath, - "--spec", - "openclaw@main", "--version", "2026.6.2+abcdef1234", "--started-at", @@ -143,22 +149,6 @@ test("imports Telegram qa-evidence as the existing RTT row shape", async () => { assert.match(row.artifacts.resultPath, /^runs\/telegram\/.+\/result\.json$/u); }); -test("requires package metadata for qa-evidence imports", async () => { - const workspace = await makeWorkspace(); - const evidencePath = path.join(workspace, "qa-evidence.json"); - await writeJson(evidencePath, { - kind: "openclaw.qa.evidence-summary", - schemaVersion: 2, - generatedAt: "2026-06-12T20:00:20.000Z", - entries: [], - }); - - await assert.rejects( - execFileAsync(process.execPath, [IMPORT_SCRIPT, evidencePath], { cwd: workspace }), - /--spec must be a non-empty string/u, - ); -}); - test("rejects qa-evidence without aggregate Telegram RTT samples", async () => { const workspace = await makeWorkspace(); const evidencePath = path.join(workspace, "qa-evidence.json"); @@ -186,6 +176,12 @@ test("rejects qa-evidence without aggregate Telegram RTT samples", async () => { id: "telegram-mentioned-message-reply", title: "Telegram mentioned message gets a reply", }, + execution: { + packageSource: { + kind: "npm-package", + spec: "openclaw@main", + }, + }, result: { status: "pass", timing: { @@ -202,8 +198,6 @@ test("rejects qa-evidence without aggregate Telegram RTT samples", async () => { [ IMPORT_SCRIPT, evidencePath, - "--spec", - "openclaw@main", "--version", "2026.6.2+abcdef1234", "--started-at",