diff --git a/.github/workflows/main-rtt.yml b/.github/workflows/main-rtt.yml index 12c1de01..03ee17af 100644 --- a/.github/workflows/main-rtt.yml +++ b/.github/workflows/main-rtt.yml @@ -98,7 +98,9 @@ jobs: echo "No package tgz produced." >&2 exit 1 fi + package_version="$(node -e 'const fs = require("node:fs"); process.stdout.write(JSON.parse(fs.readFileSync("package.json", "utf8")).version)')+$(git rev-parse --short=10 HEAD)" echo "package_tgz=${PWD}/${package_tgz}" >>"$GITHUB_OUTPUT" + echo "package_version=$package_version" >>"$GITHUB_OUTPUT" - name: Run RTT id: rtt @@ -115,27 +117,39 @@ jobs: echo "samples must be a positive integer, got: $samples" >&2 exit 1 fi + output="$RUNNER_TEMP/openclaw-rtt-runs/main" + rtt_env=( + OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="openclaw@main" + OPENCLAW_NPM_TELEGRAM_PACKAGE_TGZ="${{ steps.pack.outputs.package_tgz }}" + OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" + OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 + ) + if [[ "$samples" != "20" ]]; then + rtt_env+=(OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES="$samples") + fi metrics_path="$RUNNER_TEMP/openclaw-rtt-resource-metrics.env" + started_at="$(node -e 'process.stdout.write(new Date().toISOString())')" set +e - /usr/bin/time \ + env \ + "${rtt_env[@]}" \ + /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ - pnpm rtt openclaw@main \ - --package-tgz "${{ steps.pack.outputs.package_tgz }}" \ - --harness-root "$PWD" \ - --output "$RUNNER_TEMP/openclaw-rtt-runs" \ - --samples "$samples" \ - --timeout-ms 240000 \ - --sample-timeout-ms 30000 + pnpm test:docker:npm-telegram-live status="$?" set -e - result_path="$(find "$RUNNER_TEMP/openclaw-rtt-runs" -maxdepth 3 -name result.json -type f | sort | tail -1)" - if [[ -z "$result_path" ]]; then - echo "No RTT result.json produced." >&2 + finished_at="$(node -e 'process.stdout.write(new Date().toISOString())')" + evidence_path="$output/raw/qa-evidence.json" + if [[ ! 
-f "$evidence_path" ]]; then + echo "No qa-evidence.json produced." >&2 exit "$status" fi - echo "result_path=$result_path" >>"$GITHUB_OUTPUT" - echo "resource_metrics_path=$metrics_path" >>"$GITHUB_OUTPUT" + { + echo "evidence_path=$evidence_path" + echo "resource_metrics_path=$metrics_path" + echo "started_at=$started_at" + echo "finished_at=$finished_at" + } >>"$GITHUB_OUTPUT" - name: Import result working-directory: openclaw-rtt @@ -143,7 +157,10 @@ jobs: run: | set -euo pipefail git pull --rebase origin main - node scripts/import-result.mjs "${{ steps.rtt.outputs.result_path }}" \ + node scripts/import-result.mjs "${{ steps.rtt.outputs.evidence_path }}" \ + --version "${{ steps.pack.outputs.package_version }}" \ + --started-at "${{ steps.rtt.outputs.started_at }}" \ + --finished-at "${{ steps.rtt.outputs.finished_at }}" \ --resource-metrics "${{ steps.rtt.outputs.resource_metrics_path }}" node scripts/validate.mjs node scripts/summary.mjs diff --git a/.github/workflows/stable-release-rtt.yml b/.github/workflows/stable-release-rtt.yml index 11e8929a..40e7ea2e 100644 --- a/.github/workflows/stable-release-rtt.yml +++ b/.github/workflows/stable-release-rtt.yml @@ -131,32 +131,34 @@ jobs: shell: bash run: | set -euo pipefail - for spec in ${{ steps.release.outputs.specs }}; do + read -r -a specs <<< "${{ steps.release.outputs.specs }}" + read -r -a versions <<< "${{ steps.release.outputs.versions }}" + for index in "${!specs[@]}"; do + spec="${specs[$index]}" + version="${versions[$index]}" output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')" metrics_path="${output}/resource-metrics.env" mkdir -p "$output" set +e - /usr/bin/time \ + OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ + OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ + OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ + /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ timeout --kill-after=30s 10m \ - pnpm rtt "$spec" \ - --harness-root 
"$PWD" \ - --output "$output" \ - --samples 20 \ - --timeout-ms 240000 \ - --sample-timeout-ms 30000 + pnpm test:docker:npm-telegram-live status="$?" set -e echo "exit_status=${status}" >>"$metrics_path" - result_path="$(find "$output" -maxdepth 3 -name result.json -type f | sort | tail -1)" - if [[ -z "$result_path" ]]; then - echo "Skipping RSS backfill for $spec because the rerun did not produce a result." >&2 + evidence_path="$output/raw/qa-evidence.json" + if [[ ! -f "$evidence_path" ]]; then + echo "Skipping RSS backfill for $spec because the rerun did not produce qa-evidence.json." >&2 continue fi - run_status="$(node -e 'const fs = require("node:fs"); const result = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); process.stdout.write(String(result.run?.status || ""));' "$result_path")" + run_status="$(node -e 'const fs = require("node:fs"); const evidence = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); const failed = !Array.isArray(evidence.entries) || evidence.entries.some((entry) => entry?.result?.status !== "pass"); process.stdout.write(failed ? "fail" : "pass");' "$evidence_path")" if [[ "$run_status" != "pass" ]]; then - echo "Skipping RSS backfill for failed rerun ${result_path} (status=${run_status:-missing})." >&2 + echo "Skipping RSS backfill for failed rerun ${evidence_path} (status=${run_status:-missing})." 
>&2 continue fi ( @@ -164,7 +166,8 @@ jobs: git pull --rebase origin "${GITHUB_REF_NAME:-main}" node scripts/backfill-release-rss.mjs \ --family telegram \ - --result "$result_path" \ + --spec "$spec" \ + --version "$version" \ --resource-metrics "$metrics_path" node scripts/validate.mjs git add data/channels/telegram/ runs/telegram/ @@ -195,28 +198,32 @@ jobs: set -euo pipefail result_paths="$RUNNER_TEMP/openclaw-rtt-result-paths.txt" : >"$result_paths" - for spec in ${{ steps.release.outputs.specs }}; do + read -r -a specs <<< "${{ steps.release.outputs.specs }}" + read -r -a versions <<< "${{ steps.release.outputs.versions }}" + for index in "${!specs[@]}"; do + spec="${specs[$index]}" + version="${versions[$index]}" output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')" metrics_path="${output}/resource-metrics.env" mkdir -p "$output" + started_at="$(node -e 'process.stdout.write(new Date().toISOString())')" set +e - /usr/bin/time \ + OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \ + OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \ + OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \ + /usr/bin/time \ -f 'max_rss_kb=%M\nelapsed_seconds=%e' \ -o "$metrics_path" \ - pnpm rtt "$spec" \ - --harness-root "$PWD" \ - --output "$output" \ - --samples 20 \ - --timeout-ms 240000 \ - --sample-timeout-ms 30000 + pnpm test:docker:npm-telegram-live status="$?" set -e + finished_at="$(node -e 'process.stdout.write(new Date().toISOString())')" echo "exit_status=${status}" >>"$metrics_path" - result_path="$(find "$output" -maxdepth 3 -name result.json -type f | sort | tail -1)" - if [[ -z "$result_path" ]]; then - echo "No RTT result.json produced for $spec." >&2 + evidence_path="$output/raw/qa-evidence.json" + if [[ ! -f "$evidence_path" ]]; then + echo "No qa-evidence.json produced for $spec." 
>&2 if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then - echo "Skipping RSS backfill for $spec because the rerun did not produce a result." >&2 + echo "Skipping RSS backfill for $spec because the rerun did not produce qa-evidence.json." >&2 continue fi if [[ "$status" -eq 0 ]]; then @@ -224,7 +231,7 @@ jobs: fi exit "$status" fi - printf '%s\t%s\n' "$result_path" "$metrics_path" >>"$result_paths" + printf '%s\t%s\t%s\t%s\t%s\n' "$evidence_path" "$metrics_path" "$version" "$started_at" "$finished_at" >>"$result_paths" done echo "result_paths=$result_paths" >>"$GITHUB_OUTPUT" @@ -235,20 +242,12 @@ jobs: run: | set -euo pipefail git pull --rebase origin "${GITHUB_REF_NAME:-main}" - while IFS=$'\t' read -r result_path metrics_path; do - if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then - run_status="$(node -e 'const fs = require("node:fs"); const result = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); process.stdout.write(String(result.run?.status || ""));' "$result_path")" - if [[ "$run_status" != "pass" ]]; then - echo "Skipping RSS backfill for failed rerun ${result_path} (status=${run_status:-missing})." 
>&2 - continue - fi - node scripts/backfill-release-rss.mjs \ - --family telegram \ - --result "$result_path" \ - --resource-metrics "$metrics_path" - else - node scripts/import-result.mjs "$result_path" --resource-metrics "$metrics_path" - fi + while IFS=$'\t' read -r evidence_path metrics_path version started_at finished_at; do + node scripts/import-result.mjs "$evidence_path" \ + --version "$version" \ + --started-at "$started_at" \ + --finished-at "$finished_at" \ + --resource-metrics "$metrics_path" done <"${{ steps.rtt.outputs.result_paths }}" node scripts/validate.mjs node scripts/summary.mjs diff --git a/README.md b/README.md index cb10fe77..7a0a86a7 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ Latest imported surface run: `2026-06-12T13:45:18.208Z` ## Telegram Release Runs -Telegram release runs use the OpenClaw repo black-box harness on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target normal-reply samples, a 240s canary timeout, and a 30s per-sample timeout. +Telegram release runs use the OpenClaw package Telegram live QA lane on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target RTT checks, a 240s scenario timeout, and a 30s per-check timeout. New rows import aggregate timing from `qa-evidence.json`; older rows imported by the retired package RTT wrapper keep their historical per-sample arrays. The system under test is the published package running its own Telegram bot. The OpenClaw repo only supplies the mock model server and Telegram driver. `p50` is the median normal-reply RTT. Log notes: [2026-05-02 Testbox stable sweep](logs/2026-05-02-testbox-stable-sweep.md). diff --git a/docs/channel-expansion.md b/docs/channel-expansion.md index f6bf351c..496df682 100644 --- a/docs/channel-expansion.md +++ b/docs/channel-expansion.md @@ -5,7 +5,7 @@ ## Current State - All imported rows use `data/channels/<channel>/<version>.jsonl` and `runs/<channel>/<run-id>/result.json`. 
-- Telegram main/release RTT still uses the older `pnpm rtt` source shape, but it now writes through the shared Telegram channel storage path. +- Telegram main/release RTT uses the OpenClaw package Telegram live lane and imports aggregate timing from `qa-evidence.json` through the shared Telegram channel storage path. - Discord main/release RTT uses the live QA lane with a specialized importer because its summary currently needs observed-message timestamp fallback. - Slack and WhatsApp main RTT use the reusable live-transport importer. - The Discord release resolver backfills missing versions from the Telegram release baseline before measuring future versions. It skips releases that predate or fail the Discord canary contract instead of reporting them as runnable gaps. @@ -37,7 +37,7 @@ Each sample is wrapped with `/usr/bin/time` and imports process max RSS in kilob Discord is intentionally not migrated to the generic live-transport importer yet. Its summary currently omits RTT fields, so the generic importer supports observed-message timestamp fallback and has test coverage for that path, but the existing Discord workflow remains stable while the new channel lane proves itself. -Telegram is listed in the channel config for the future live-transport path, but the current production graph remains on the older `pnpm rtt` package-result path because that is what release sweeps already use. +Telegram is listed in the channel config because the package Telegram live lane now emits QA evidence for `telegram-mentioned-message-reply`. Future Telegram rows keep the same dashboard metrics as older rows, but the source is aggregate `qa-evidence.json` timing rather than the retired package RTT wrapper's per-sample result JSON. Do not read cross-channel values as pure transport rankings. Telegram release rows use `telegram-mentioned-message-reply`; Discord, Slack, and WhatsApp rows use canary scenarios. 
The live-transport lane also includes QA-lab process overhead in RSS because the measured process is `pnpm openclaw qa <channel>`, not only the channel adapter. diff --git a/docs/data-imports.md b/docs/data-imports.md index c8b688f2..ad3d75a3 100644 --- a/docs/data-imports.md +++ b/docs/data-imports.md @@ -7,8 +7,8 @@ Run importers from the repo root: ```sh -node scripts/import-result.mjs ../clawdbot/runs/<run-id>/result.json -node scripts/import-result.mjs ../clawdbot/runs/<run-id>/result.json --resource-metrics resource-metrics.env +node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live/<run-id>/qa-evidence.json --version <version> --started-at <iso-timestamp> --finished-at <iso-timestamp> +node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live/<run-id>/qa-evidence.json --version <version> --started-at <iso-timestamp> --finished-at <iso-timestamp> --resource-metrics resource-metrics.env node scripts/import-discord-rtt.mjs samples.tsv --spec openclaw@main --version <version> (cd ../openclaw && node --import tsx ../openclaw-rtt/scripts/measure-rpc-rtt.mjs --output-dir ../openclaw-rtt/.artifacts/rpc-rtt/sample-1) node scripts/import-surface-rtt.mjs rpc-samples.tsv --surface rpc --spec openclaw@main --version <version> --provider-mode gateway-rpc --scenario rpc-gateway-smoke --require-pass @@ -18,13 +18,11 @@ node scripts/backfill-release-rss.mjs --family discord --spec openclaw@2026.5.16 node scripts/summary.mjs ``` -Telegram release imports expect the `result.json` shape emitted by: +Telegram release imports expect the aggregate `qa-evidence.json` shape emitted by the OpenClaw package Telegram live lane. The OpenClaw harness checkout must include the package Telegram RTT evidence path; older OpenClaw packages can still be the system under test. 
```sh -pnpm rtt openclaw@beta -pnpm rtt openclaw@beta --samples 20 -pnpm rtt openclaw@latest -pnpm rtt openclaw@2026.4.30 --provider live-frontier +OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC=openclaw@beta \ +pnpm test:docker:npm-telegram-live ``` ## Data Layout @@ -36,7 +34,7 @@ pnpm rtt openclaw@2026.4.30 --provider live-frontier Current channel folders are `telegram`, `discord`, `slack`, and `whatsapp`. Telegram and Discord still have specialized importers because their source artifact shapes differ; they now share the same storage contract as generic live-transport channels. -Raw Telegram QA artifacts stay in the OpenClaw repo artifact directory unless explicitly copied here later. +Raw Telegram QA artifacts stay in the OpenClaw repo artifact directory unless explicitly copied here later. Historical Telegram rows imported from the old package RTT wrapper may include `rtt.warmSamples` with every successful sample value. New Telegram imports from `qa-evidence.json` preserve the aggregate dashboard metrics (`canaryMs`, `mentionReplyMs`, `avgMs`, `p50Ms`, `p95Ms`, `maxMs`, `sampleCount`, and `failedSamples`) but do not reconstruct individual sample RTT arrays because the evidence artifact stores aggregate timing. Release RSS backfills only write `resources` onto an existing Telegram or Discord row and its copied `result.json`. The backfill command asserts the stored RTT `p50` and `p95` values are unchanged before it rewrites that version's JSONL file. RSS is process-level data around the sampled command, not isolated channel transport memory. diff --git a/scripts/backfill-release-rss.mjs b/scripts/backfill-release-rss.mjs index 6ba1d62d..7eb990cf 100644 --- a/scripts/backfill-release-rss.mjs +++ b/scripts/backfill-release-rss.mjs @@ -78,7 +78,10 @@ async function readResources(args) { const measurement = { kind: "process-max-rss", scope: args.family === "telegram" ? "release-harness-command" : "qa-command", - command: args.family === "telegram" ? 
"pnpm rtt" : `pnpm openclaw qa ${args.family}`, + command: + args.family === "telegram" + ? "pnpm test:docker:npm-telegram-live" + : `pnpm openclaw qa ${args.family}`, }; if (args.resourceMetricsPath) { return aggregateResources([await readResourceMetrics(path.resolve(args.resourceMetricsPath))], measurement); diff --git a/scripts/backfill-release-rss.test.mjs b/scripts/backfill-release-rss.test.mjs index 138cc687..dccce50d 100644 --- a/scripts/backfill-release-rss.test.mjs +++ b/scripts/backfill-release-rss.test.mjs @@ -60,7 +60,7 @@ test("backfills Telegram RSS without touching RTT p50/p95", async () => { assert.deepEqual(updated.resources.measurement, { kind: "process-max-rss", scope: "release-harness-command", - command: "pnpm rtt", + command: "pnpm test:docker:npm-telegram-live", }); assert.deepEqual(updated.resources.maxRssKbSamples, [409600]); assert.equal(updated.resources.maxRssKb.max, 409600); diff --git a/scripts/import-result.mjs b/scripts/import-result.mjs index 332b4e89..8acc6b0f 100644 --- a/scripts/import-result.mjs +++ b/scripts/import-result.mjs @@ -16,8 +16,9 @@ const TELEGRAM_CHANNEL = { function usage() { return [ - "Usage: node scripts/import-result.mjs ", + "Usage: node scripts/import-result.mjs ", " [--resource-metrics ]", + " --version --started-at --finished-at ", ].join("\n"); } @@ -33,6 +34,18 @@ function parseArgs(argv) { args.resourceMetricsPath = argv[(index += 1)]; continue; } + if (arg === "--version") { + args.version = argv[(index += 1)]; + continue; + } + if (arg === "--started-at") { + args.startedAt = argv[(index += 1)]; + continue; + } + if (arg === "--finished-at") { + args.finishedAt = argv[(index += 1)]; + continue; + } throw new Error(`Unknown argument: ${arg}\n${usage()}`); } if (!args.sourcePath) { @@ -62,49 +75,154 @@ function requireNumber(value, label) { return value; } -function validateOptionalNumber(value, label) { - if (value !== undefined) { - requireNumber(value, label); - } -} - -function validateResult(value) 
{ - const result = requireObject(value, "result"); - const packageInfo = requireObject(result.package, "result.package"); - const run = requireObject(result.run, "result.run"); - const mode = requireObject(result.mode, "result.mode"); - const rtt = requireObject(result.rtt, "result.rtt"); - - requireString(packageInfo.spec, "result.package.spec"); - requireString(packageInfo.version, "result.package.version"); - requireString(run.id, "result.run.id"); - requireString(run.startedAt, "result.run.startedAt"); - requireString(run.finishedAt, "result.run.finishedAt"); - requireNumber(run.durationMs, "result.run.durationMs"); - if (run.status !== "pass" && run.status !== "fail") { - throw new Error("result.run.status must be pass or fail."); - } - requireString(mode.providerMode, "result.mode.providerMode"); - if (!Array.isArray(mode.scenarios)) { - throw new Error("result.mode.scenarios must be an array."); - } - validateOptionalNumber(rtt.canaryMs, "result.rtt.canaryMs"); - validateOptionalNumber(rtt.mentionReplyMs, "result.rtt.mentionReplyMs"); - validateOptionalNumber(rtt.avgMs, "result.rtt.avgMs"); - validateOptionalNumber(rtt.p50Ms, "result.rtt.p50Ms"); - validateOptionalNumber(rtt.p95Ms, "result.rtt.p95Ms"); - validateOptionalNumber(rtt.maxMs, "result.rtt.maxMs"); - validateOptionalNumber(rtt.failedSamples, "result.rtt.failedSamples"); - if (rtt.warmSamples !== undefined) { - if (!Array.isArray(rtt.warmSamples)) { - throw new Error("result.rtt.warmSamples must be an array."); - } - rtt.warmSamples.forEach((sample, index) => { - requireNumber(sample, `result.rtt.warmSamples[${index}]`); - }); +function safeRunLabel(input) { + return input.replace(/[^a-zA-Z0-9.-]+/gu, "_").replace(/^_+|_+$/gu, ""); +} + +function buildEvidenceRunId(startedAt, spec) { + return [ + startedAt.replaceAll(":", "").replaceAll(".", ""), + safeRunLabel(spec), + "telegram", + TELEGRAM_CHANNEL.scenario, + "rtt", + ].join("-"); +} + +function requireEvidenceArgs(args) { + 
requireString(args.version, "--version"); + requireString(args.startedAt, "--started-at"); + requireString(args.finishedAt, "--finished-at"); + const startedAtMs = Date.parse(args.startedAt); + const finishedAtMs = Date.parse(args.finishedAt); + if (!Number.isFinite(startedAtMs)) { + throw new Error("--started-at must be a parseable ISO timestamp."); + } + if (!Number.isFinite(finishedAtMs)) { + throw new Error("--finished-at must be a parseable ISO timestamp."); + } + if (finishedAtMs < startedAtMs) { + throw new Error("--finished-at must be at or after --started-at."); + } + return { finishedAtMs, startedAtMs }; +} + +function evidenceEntry(evidence, testId) { + const entries = Array.isArray(evidence.entries) ? evidence.entries : []; + const entry = entries.find((candidate) => candidate?.test?.id === testId); + if (!entry) { + const available = entries.map((candidate) => candidate?.test?.id).filter(Boolean).join(", "); + throw new Error(`qa evidence missing ${testId}; available: ${available || ""}`); } + return entry; +} - return result; +function packageSpecFromEvidence(entry) { + const spec = entry?.execution?.packageSource?.spec; + return requireString(spec, "qa evidence execution.packageSource.spec"); +} + +function readTiming(entry, label) { + const timing = entry?.result?.timing; + if (!timing || typeof timing !== "object" || Array.isArray(timing)) { + throw new Error(`${label} is missing result.timing.`); + } + return timing; +} + +function finiteTimingNumber(timing, name) { + const value = timing[name]; + return typeof value === "number" && Number.isFinite(value) ? 
value : undefined; +} + +function requirePositiveTimingNumber(timing, name, label) { + const value = finiteTimingNumber(timing, name); + if (value === undefined || value <= 0) { + throw new Error(`${label} must include positive result.timing.${name}.`); + } + return value; +} + +function providerModeFromEvidence(entry) { + const provider = entry?.execution?.provider; + if (!provider || typeof provider !== "object" || Array.isArray(provider)) { + return "unknown"; + } + if (typeof provider.auth === "string" && provider.auth.trim()) { + return provider.auth; + } + if (typeof provider.fixture === "string" && provider.fixture.trim()) { + return provider.fixture; + } + return provider.live === true ? "live-frontier" : "mock-openai"; +} + +function statusFromEvidence(entries) { + return entries.every((entry) => entry?.result?.status === "pass") ? "pass" : "fail"; +} + +function buildResultFromEvidence(evidence, args) { + if (evidence.kind !== "openclaw.qa.evidence-summary") { + throw new Error("input must be an OpenClaw qa-evidence.json summary."); + } + const { finishedAtMs, startedAtMs } = requireEvidenceArgs(args); + const canary = evidenceEntry(evidence, "telegram-canary"); + const mention = evidenceEntry(evidence, TELEGRAM_CHANNEL.scenario); + const packageSpec = packageSpecFromEvidence(mention); + const canaryTiming = readTiming(canary, "telegram-canary"); + const mentionTiming = readTiming(mention, TELEGRAM_CHANNEL.scenario); + const canaryMs = finiteTimingNumber(canaryTiming, "rttMs"); + const sampleCount = requirePositiveTimingNumber( + mentionTiming, + "samples", + TELEGRAM_CHANNEL.scenario, + ); + const mentionReplyMs = finiteTimingNumber(mentionTiming, "p50Ms"); + const failedSamples = finiteTimingNumber(mentionTiming, "failedSamples"); + return { + package: { + spec: packageSpec, + version: args.version, + }, + run: { + id: buildEvidenceRunId(args.startedAt, packageSpec), + startedAt: args.startedAt, + finishedAt: args.finishedAt, + durationMs: finishedAtMs 
- startedAtMs, + status: + statusFromEvidence([canary, mention]) === "pass" && + typeof canaryMs === "number" && + typeof mentionReplyMs === "number" + ? "pass" + : "fail", + }, + mode: { + providerMode: providerModeFromEvidence(mention), + scenarios: [TELEGRAM_CHANNEL.scenario], + source: "qa-evidence", + }, + rtt: { + canaryMs, + mentionReplyMs, + avgMs: finiteTimingNumber(mentionTiming, "avgMs"), + p50Ms: mentionReplyMs, + p95Ms: finiteTimingNumber(mentionTiming, "p95Ms"), + maxMs: finiteTimingNumber(mentionTiming, "maxMs"), + failedSamples, + sampleCount, + sources: ["qa-evidence"], + }, + samples: [ + { + index: 1, + status: mention?.result?.status === "pass" ? "pass" : "fail", + details: `aggregate timing from qa-evidence.json (${Math.max( + 0, + sampleCount - (failedSamples ?? 0), + )}/${sampleCount} samples passed)`, + }, + ], + }; } async function readJson(pathname) { @@ -116,17 +234,26 @@ async function existingRunIds() { return new Set([...ids].filter(Boolean)); } +function resourceMeasurementForResult(result) { + return { + kind: "process-max-rss", + scope: "release-harness-command", + command: "pnpm test:docker:npm-telegram-live", + }; +} + async function main() { const args = parseArgs(process.argv.slice(2)); - const result = validateResult(await readJson(path.resolve(args.sourcePath))); + const input = await readJson(path.resolve(args.sourcePath)); + const result = buildResultFromEvidence(requireObject(input, "input"), args); const seen = await existingRunIds(); if (seen.has(result.run.id)) { throw new Error(`Run already imported: ${result.run.id}`); } if (args.resourceMetricsPath) { const resourceMetrics = await readResourceMetrics(path.resolve(args.resourceMetricsPath)); - result.resources = aggregateResources([resourceMetrics]); + result.resources = aggregateResources([resourceMetrics], resourceMeasurementForResult(result)); } result.channel = { ...TELEGRAM_CHANNEL, diff --git a/scripts/import-result.test.mjs b/scripts/import-result.test.mjs new 
file mode 100644 index 00000000..1d6c7f5b --- /dev/null +++ b/scripts/import-result.test.mjs @@ -0,0 +1,212 @@ +import assert from "node:assert/strict"; +import { execFile } from "node:child_process"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import { promisify } from "node:util"; +import test from "node:test"; + +const execFileAsync = promisify(execFile); +const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), ".."); +const IMPORT_SCRIPT = path.join(REPO_ROOT, "scripts/import-result.mjs"); + +async function makeWorkspace() { + return await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-rtt-import-result-test-")); +} + +async function writeJson(pathname, value) { + await fs.mkdir(path.dirname(pathname), { recursive: true }); + await fs.writeFile(pathname, `${JSON.stringify(value, null, 2)}\n`); +} + +async function readJsonl(pathname) { + const text = await fs.readFile(pathname, "utf8"); + return text + .split("\n") + .filter(Boolean) + .map((line) => JSON.parse(line)); +} + +test("imports Telegram qa-evidence as the existing RTT row shape", async () => { + const workspace = await makeWorkspace(); + const evidencePath = path.join(workspace, "qa-evidence.json"); + const metricsPath = path.join(workspace, "resource-metrics.env"); + await writeJson(evidencePath, { + kind: "openclaw.qa.evidence-summary", + schemaVersion: 2, + generatedAt: "2026-06-12T20:00:20.000Z", + entries: [ + { + test: { + kind: "live-transport-check", + id: "telegram-canary", + title: "Telegram canary", + }, + execution: { + packageSource: { + kind: "npm-package", + spec: "openclaw@main", + }, + provider: { + id: "openai", + live: false, + fixture: "mock-openai", + }, + }, + result: { + status: "pass", + timing: { + rttMs: 900, + }, + }, + }, + { + test: { + kind: "live-transport-check", + id: "telegram-mentioned-message-reply", + title: "Telegram mentioned message gets a reply", + 
}, + execution: { + packageSource: { + kind: "npm-package", + spec: "openclaw@main", + }, + provider: { + id: "openai", + live: false, + fixture: "mock-openai", + }, + }, + result: { + status: "pass", + timing: { + rttMs: 1200, + avgMs: 1300, + p50Ms: 1200, + p95Ms: 1800, + maxMs: 2200, + samples: 5, + failedSamples: 1, + }, + }, + }, + ], + }); + await fs.writeFile(metricsPath, "max_rss_kb=204800\nelapsed_seconds=22.5\n"); + + await execFileAsync( + process.execPath, + [ + IMPORT_SCRIPT, + evidencePath, + "--version", + "2026.6.2+abcdef1234", + "--started-at", + "2026-06-12T20:00:00.000Z", + "--finished-at", + "2026-06-12T20:00:30.000Z", + "--resource-metrics", + metricsPath, + ], + { cwd: workspace }, + ); + + const [row] = await readJsonl( + path.join(workspace, "data/channels/telegram/2026.6.2+abcdef1234.jsonl"), + ); + assert.deepEqual(row.channel, { + id: "telegram", + label: "Telegram", + scenario: "telegram-mentioned-message-reply", + }); + assert.equal(row.package.spec, "openclaw@main"); + assert.equal(row.package.version, "2026.6.2+abcdef1234"); + assert.equal(row.run.status, "pass"); + assert.equal(row.run.durationMs, 30_000); + assert.equal(row.mode.providerMode, "mock-openai"); + assert.equal(row.mode.source, "qa-evidence"); + assert.equal(row.rtt.canaryMs, 900); + assert.equal(row.rtt.mentionReplyMs, 1200); + assert.equal(row.rtt.avgMs, 1300); + assert.equal(row.rtt.p50Ms, 1200); + assert.equal(row.rtt.p95Ms, 1800); + assert.equal(row.rtt.maxMs, 2200); + assert.equal(row.rtt.sampleCount, 5); + assert.equal(row.rtt.failedSamples, 1); + assert.deepEqual(row.rtt.sources, ["qa-evidence"]); + assert.deepEqual(row.resources.measurement, { + kind: "process-max-rss", + scope: "release-harness-command", + command: "pnpm test:docker:npm-telegram-live", + }); + assert.deepEqual(row.resources.maxRssKbSamples, [204800]); + assert.equal(row.resources.maxRssKb.p50, 204800); + assert.deepEqual(row.resources.elapsedSecondsSamples, [22.5]); + 
assert.deepEqual(row.artifacts, { + resultPath: row.artifacts.resultPath, + }); + assert.match(row.artifacts.resultPath, /^runs\/telegram\/.+\/result\.json$/u); +}); + +test("rejects qa-evidence without aggregate Telegram RTT samples", async () => { + const workspace = await makeWorkspace(); + const evidencePath = path.join(workspace, "qa-evidence.json"); + await writeJson(evidencePath, { + kind: "openclaw.qa.evidence-summary", + schemaVersion: 2, + generatedAt: "2026-06-12T20:00:20.000Z", + entries: [ + { + test: { + kind: "live-transport-check", + id: "telegram-canary", + title: "Telegram canary", + }, + result: { + status: "pass", + timing: { + rttMs: 900, + }, + }, + }, + { + test: { + kind: "live-transport-check", + id: "telegram-mentioned-message-reply", + title: "Telegram mentioned message gets a reply", + }, + execution: { + packageSource: { + kind: "npm-package", + spec: "openclaw@main", + }, + }, + result: { + status: "pass", + timing: { + rttMs: 1200, + }, + }, + }, + ], + }); + + await assert.rejects( + execFileAsync( + process.execPath, + [ + IMPORT_SCRIPT, + evidencePath, + "--version", + "2026.6.2+abcdef1234", + "--started-at", + "2026-06-12T20:00:00.000Z", + "--finished-at", + "2026-06-12T20:00:30.000Z", + ], + { cwd: workspace }, + ), + /telegram-mentioned-message-reply must include positive result\.timing\.samples/u, + ); +});