openclaw · vincentkoc · Jun 14, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 13, 2026
@@ -98,7 +98,9 @@ jobs:
             echo "No package tgz produced." >&2
             exit 1
           fi
+          package_version="$(node -e 'const fs = require("node:fs"); process.stdout.write(JSON.parse(fs.readFileSync("package.json", "utf8")).version)')+$(git rev-parse --short=10 HEAD)"
           echo "package_tgz=${PWD}/${package_tgz}" >>"$GITHUB_OUTPUT"
+          echo "package_version=$package_version" >>"$GITHUB_OUTPUT"
 
       - name: Run RTT
         id: rtt
@@ -115,35 +117,50 @@ jobs:
             echo "samples must be a positive integer, got: $samples" >&2
             exit 1
           fi
+          output="$RUNNER_TEMP/openclaw-rtt-runs/main"
+          rtt_env=(
+            OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="openclaw@main"
+            OPENCLAW_NPM_TELEGRAM_PACKAGE_TGZ="${{ steps.pack.outputs.package_tgz }}"
+            OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw"
+            OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000
+          )
+          if [[ "$samples" != "20" ]]; then
+            rtt_env+=(OPENCLAW_NPM_TELEGRAM_RTT_SAMPLES="$samples")
+          fi
           metrics_path="$RUNNER_TEMP/openclaw-rtt-resource-metrics.env"
+          started_at="$(node -e 'process.stdout.write(new Date().toISOString())')"
           set +e
-          /usr/bin/time \
+          env \
+            "${rtt_env[@]}" \
+            /usr/bin/time \
             -f 'max_rss_kb=%M\nelapsed_seconds=%e' \
             -o "$metrics_path" \
-            pnpm rtt openclaw@main \
-            --package-tgz "${{ steps.pack.outputs.package_tgz }}" \
-            --harness-root "$PWD" \
-            --output "$RUNNER_TEMP/openclaw-rtt-runs" \
-            --samples "$samples" \
-            --timeout-ms 240000 \
-            --sample-timeout-ms 30000
+            pnpm test:docker:npm-telegram-live
           status="$?"
           set -e
-          result_path="$(find "$RUNNER_TEMP/openclaw-rtt-runs" -maxdepth 3 -name result.json -type f | sort | tail -1)"
-          if [[ -z "$result_path" ]]; then
-            echo "No RTT result.json produced." >&2
+          finished_at="$(node -e 'process.stdout.write(new Date().toISOString())')"
+          evidence_path="$output/raw/qa-evidence.json"
+          if [[ ! -f "$evidence_path" ]]; then
+            echo "No qa-evidence.json produced." >&2
-            exit "$status"
+            exit "$(( status == 0 ? 1 : status ))"  # a lane that exits 0 without writing evidence must still fail this step
           fi
-          echo "result_path=$result_path" >>"$GITHUB_OUTPUT"
-          echo "resource_metrics_path=$metrics_path" >>"$GITHUB_OUTPUT"
+          {
+            echo "evidence_path=$evidence_path"
+            echo "resource_metrics_path=$metrics_path"
+            echo "started_at=$started_at"
+            echo "finished_at=$finished_at"
+          } >>"$GITHUB_OUTPUT"
 
       - name: Import result
         working-directory: openclaw-rtt
         shell: bash
         run: |
           set -euo pipefail
           git pull --rebase origin main
-          node scripts/import-result.mjs "${{ steps.rtt.outputs.result_path }}" \
+          node scripts/import-result.mjs "${{ steps.rtt.outputs.evidence_path }}" \
+            --version "${{ steps.pack.outputs.package_version }}" \
+            --started-at "${{ steps.rtt.outputs.started_at }}" \
+            --finished-at "${{ steps.rtt.outputs.finished_at }}" \
             --resource-metrics "${{ steps.rtt.outputs.resource_metrics_path }}"
           node scripts/validate.mjs
           node scripts/summary.mjs

@@ -131,40 +131,43 @@ jobs:
         shell: bash
         run: |
           set -euo pipefail
-          for spec in ${{ steps.release.outputs.specs }}; do
+          read -r -a specs <<< "${{ steps.release.outputs.specs }}"
+          read -r -a versions <<< "${{ steps.release.outputs.versions }}"
+          for index in "${!specs[@]}"; do
+            spec="${specs[$index]}"
+            version="${versions[$index]}"
             output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')"
             metrics_path="${output}/resource-metrics.env"
             mkdir -p "$output"
             set +e
-            /usr/bin/time \
+            OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \
+            OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \
+            OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \
+              /usr/bin/time \
               -f 'max_rss_kb=%M\nelapsed_seconds=%e' \
               -o "$metrics_path" \
               timeout --kill-after=30s 10m \
-              pnpm rtt "$spec" \
-              --harness-root "$PWD" \
-              --output "$output" \
-              --samples 20 \
-              --timeout-ms 240000 \
-              --sample-timeout-ms 30000
+              pnpm test:docker:npm-telegram-live
             status="$?"
             set -e
             echo "exit_status=${status}" >>"$metrics_path"
-            result_path="$(find "$output" -maxdepth 3 -name result.json -type f | sort | tail -1)"
-            if [[ -z "$result_path" ]]; then
-              echo "Skipping RSS backfill for $spec because the rerun did not produce a result." >&2
+            evidence_path="$output/raw/qa-evidence.json"
+            if [[ ! -f "$evidence_path" ]]; then
+              echo "Skipping RSS backfill for $spec because the rerun did not produce qa-evidence.json." >&2
               continue
             fi
-            run_status="$(node -e 'const fs = require("node:fs"); const result = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); process.stdout.write(String(result.run?.status || ""));' "$result_path")"
+            run_status="$(node -e 'const fs = require("node:fs"); const evidence = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); const failed = !Array.isArray(evidence.entries) || evidence.entries.some((entry) => entry?.result?.status !== "pass"); process.stdout.write(failed ? "fail" : "pass");' "$evidence_path")"
             if [[ "$run_status" != "pass" ]]; then
-              echo "Skipping RSS backfill for failed rerun ${result_path} (status=${run_status:-missing})." >&2
+              echo "Skipping RSS backfill for failed rerun ${evidence_path} (status=${run_status:-missing})." >&2
               continue
             fi
             (
               cd ../openclaw-rtt
               git pull --rebase origin "${GITHUB_REF_NAME:-main}"
               node scripts/backfill-release-rss.mjs \
                 --family telegram \
-                --result "$result_path" \
+                --spec "$spec" \
+                --version "$version" \
                 --resource-metrics "$metrics_path"
               node scripts/validate.mjs
               git add data/channels/telegram/ runs/telegram/
@@ -195,36 +198,40 @@ jobs:
           set -euo pipefail
           result_paths="$RUNNER_TEMP/openclaw-rtt-result-paths.txt"
           : >"$result_paths"
-          for spec in ${{ steps.release.outputs.specs }}; do
+          read -r -a specs <<< "${{ steps.release.outputs.specs }}"
+          read -r -a versions <<< "${{ steps.release.outputs.versions }}"
+          for index in "${!specs[@]}"; do
+            spec="${specs[$index]}"
+            version="${versions[$index]}"
             output="$RUNNER_TEMP/openclaw-rtt-runs/$(printf '%s' "$spec" | tr -c 'A-Za-z0-9._-' '_')"
             metrics_path="${output}/resource-metrics.env"
             mkdir -p "$output"
+            started_at="$(node -e 'process.stdout.write(new Date().toISOString())')"
             set +e
-            /usr/bin/time \
+            OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC="$spec" \
+            OPENCLAW_NPM_TELEGRAM_OUTPUT_DIR="$output/raw" \
+            OPENCLAW_QA_TELEGRAM_SCENARIO_TIMEOUT_MS=240000 \
+              /usr/bin/time \
               -f 'max_rss_kb=%M\nelapsed_seconds=%e' \
               -o "$metrics_path" \
-              pnpm rtt "$spec" \
-              --harness-root "$PWD" \
-              --output "$output" \
-              --samples 20 \
-              --timeout-ms 240000 \
-              --sample-timeout-ms 30000
+              pnpm test:docker:npm-telegram-live
             status="$?"
             set -e
+            finished_at="$(node -e 'process.stdout.write(new Date().toISOString())')"
             echo "exit_status=${status}" >>"$metrics_path"
-            result_path="$(find "$output" -maxdepth 3 -name result.json -type f | sort | tail -1)"
-            if [[ -z "$result_path" ]]; then
-              echo "No RTT result.json produced for $spec." >&2
+            evidence_path="$output/raw/qa-evidence.json"
+            if [[ ! -f "$evidence_path" ]]; then
+              echo "No qa-evidence.json produced for $spec." >&2
               if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then
-                echo "Skipping RSS backfill for $spec because the rerun did not produce a result." >&2
+                echo "Skipping RSS backfill for $spec because the rerun did not produce qa-evidence.json." >&2
                 continue
               fi
               if [[ "$status" -eq 0 ]]; then
                 status=1
               fi
               exit "$status"
             fi
-            printf '%s\t%s\n' "$result_path" "$metrics_path" >>"$result_paths"
+            printf '%s\t%s\t%s\t%s\t%s\n' "$evidence_path" "$metrics_path" "$version" "$started_at" "$finished_at" >>"$result_paths"
           done
           echo "result_paths=$result_paths" >>"$GITHUB_OUTPUT"
 
@@ -235,20 +242,12 @@ jobs:
         run: |
           set -euo pipefail
           git pull --rebase origin "${GITHUB_REF_NAME:-main}"
-          while IFS=$'\t' read -r result_path metrics_path; do
-            if [[ "${{ steps.release.outputs.rss_backfill }}" == "true" ]]; then
-              run_status="$(node -e 'const fs = require("node:fs"); const result = JSON.parse(fs.readFileSync(process.argv[1], "utf8")); process.stdout.write(String(result.run?.status || ""));' "$result_path")"
-              if [[ "$run_status" != "pass" ]]; then
-                echo "Skipping RSS backfill for failed rerun ${result_path} (status=${run_status:-missing})." >&2
-                continue
-              fi
-              node scripts/backfill-release-rss.mjs \
-                --family telegram \
-                --result "$result_path" \
-                --resource-metrics "$metrics_path"
-            else
-              node scripts/import-result.mjs "$result_path" --resource-metrics "$metrics_path"
-            fi
+          while IFS=$'\t' read -r evidence_path metrics_path version started_at finished_at; do
+            node scripts/import-result.mjs "$evidence_path" \
+              --version "$version" \
+              --started-at "$started_at" \
+              --finished-at "$finished_at" \
+              --resource-metrics "$metrics_path"
           done <"${{ steps.rtt.outputs.result_paths }}"
           node scripts/validate.mjs
           node scripts/summary.mjs

diff --git a/README.md b/README.md
@@ -213,7 +213,7 @@ Latest imported surface run: `2026-06-12T13:45:18.208Z`
 
 ## Telegram Release Runs
 
-Telegram release runs use the OpenClaw repo black-box harness on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target normal-reply samples, a 240s canary timeout, and a 30s per-sample timeout.
+Telegram release runs use the OpenClaw package Telegram live QA lane on Blacksmith with `mock-openai`, scenario `telegram-mentioned-message-reply`, 20 target RTT checks, a 240s scenario timeout, and a 30s per-check timeout. New rows import aggregate timing from `qa-evidence.json`; older rows imported by the retired package RTT wrapper keep their historical per-sample arrays.
 
 The system under test is the published package running its own Telegram bot. The OpenClaw repo only supplies the mock model server and Telegram driver. `p50` is the median normal-reply RTT. Log notes: [2026-05-02 Testbox stable sweep](logs/2026-05-02-testbox-stable-sweep.md).
 

diff --git a/docs/channel-expansion.md b/docs/channel-expansion.md
@@ -5,7 +5,7 @@
 ## Current State
 
 - All imported rows use `data/channels/<channel>/<version>.jsonl` and `runs/<channel>/<run-id>/result.json`.
-- Telegram main/release RTT still uses the older `pnpm rtt` source shape, but it now writes through the shared Telegram channel storage path.
+- Telegram main/release RTT uses the OpenClaw package Telegram live lane and imports aggregate timing from `qa-evidence.json` through the shared Telegram channel storage path.
 - Discord main/release RTT uses the live QA lane with a specialized importer because its summary currently needs observed-message timestamp fallback.
 - Slack and WhatsApp main RTT use the reusable live-transport importer.
 - The Discord release resolver backfills missing versions from the Telegram release baseline before measuring future versions. It skips releases that predate or fail the Discord canary contract instead of reporting them as runnable gaps.
@@ -37,7 +37,7 @@ Each sample is wrapped with `/usr/bin/time` and imports process max RSS in kilob
 
 Discord is intentionally not migrated to the generic live-transport importer yet. Its summary currently omits RTT fields, so the generic importer supports observed-message timestamp fallback and has test coverage for that path, but the existing Discord workflow remains stable while the new channel lane proves itself.
 
-Telegram is listed in the channel config for the future live-transport path, but the current production graph remains on the older `pnpm rtt` package-result path because that is what release sweeps already use.
+Telegram is listed in the channel config because the package Telegram live lane now emits QA evidence for `telegram-mentioned-message-reply`. Future Telegram rows keep the same dashboard metrics as older rows, but the source is aggregate `qa-evidence.json` timing rather than the retired package RTT wrapper's per-sample result JSON.
 
 Do not read cross-channel values as pure transport rankings. Telegram release rows use `telegram-mentioned-message-reply`; Discord, Slack, and WhatsApp rows use canary scenarios. The live-transport lane also includes QA-lab process overhead in RSS because the measured process is `pnpm openclaw qa <channel>`, not only the channel adapter.
 

diff --git a/docs/data-imports.md b/docs/data-imports.md
@@ -7,8 +7,8 @@
 Run importers from the repo root:
 
 ```sh
-node scripts/import-result.mjs ../clawdbot/runs/<run-id>/result.json
-node scripts/import-result.mjs ../clawdbot/runs/<run-id>/result.json --resource-metrics resource-metrics.env
+node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live/<run>/qa-evidence.json --version <version> --started-at <iso> --finished-at <iso>
+node scripts/import-result.mjs ../openclaw/.artifacts/qa-e2e/npm-telegram-live/<run>/qa-evidence.json --version <version> --started-at <iso> --finished-at <iso> --resource-metrics resource-metrics.env
 node scripts/import-discord-rtt.mjs samples.tsv --spec openclaw@main --version <ref>
 (cd ../openclaw && node --import tsx ../openclaw-rtt/scripts/measure-rpc-rtt.mjs --output-dir ../openclaw-rtt/.artifacts/rpc-rtt/sample-1)
 node scripts/import-surface-rtt.mjs rpc-samples.tsv --surface rpc --spec openclaw@main --version <ref> --provider-mode gateway-rpc --scenario rpc-gateway-smoke --require-pass
@@ -18,13 +18,11 @@ node scripts/backfill-release-rss.mjs --family discord --spec openclaw@2026.5.16
 node scripts/summary.mjs
 ```
 
-Telegram release imports expect the `result.json` shape emitted by:
+Telegram release imports expect the aggregate `qa-evidence.json` shape emitted by the OpenClaw package Telegram live lane. The OpenClaw harness checkout must include the package Telegram RTT evidence support (the lane code that writes `qa-evidence.json`); older OpenClaw packages can still be the system under test.
 
 ```sh
-pnpm rtt openclaw@beta
-pnpm rtt openclaw@beta --samples 20
-pnpm rtt openclaw@latest
-pnpm rtt openclaw@2026.4.30 --provider live-frontier
+OPENCLAW_NPM_TELEGRAM_PACKAGE_SPEC=openclaw@beta \
+pnpm test:docker:npm-telegram-live
 ```
 
 ## Data Layout
@@ -36,7 +34,7 @@ pnpm rtt openclaw@2026.4.30 --provider live-frontier
 
 Current channel folders are `telegram`, `discord`, `slack`, and `whatsapp`. Telegram and Discord still have specialized importers because their source artifact shapes differ; they now share the same storage contract as generic live-transport channels.
 
-Raw Telegram QA artifacts stay in the OpenClaw repo artifact directory unless explicitly copied here later.
+Raw Telegram QA artifacts stay in the OpenClaw repo artifact directory unless explicitly copied here later. Historical Telegram rows imported from the old package RTT wrapper may include `rtt.warmSamples` with every successful sample value. New Telegram imports from `qa-evidence.json` preserve the aggregate dashboard metrics (`canaryMs`, `mentionReplyMs`, `avgMs`, `p50Ms`, `p95Ms`, `maxMs`, `sampleCount`, and `failedSamples`) but do not reconstruct individual sample RTT arrays, because the evidence artifact stores only aggregate timing.
 
 Release RSS backfills only write `resources` onto an existing Telegram or Discord row and its copied `result.json`. The backfill command asserts the stored RTT `p50` and `p95` values are unchanged before it rewrites that version's JSONL file. RSS is process-level data around the sampled command, not isolated channel transport memory.
 

diff --git a/scripts/backfill-release-rss.mjs b/scripts/backfill-release-rss.mjs
@@ -78,7 +78,10 @@ async function readResources(args) {
   const measurement = {
     kind: "process-max-rss",
     scope: args.family === "telegram" ? "release-harness-command" : "qa-command",
-    command: args.family === "telegram" ? "pnpm rtt" : `pnpm openclaw qa ${args.family}`,
+    command:
+      args.family === "telegram"
+        ? "pnpm test:docker:npm-telegram-live"
+        : `pnpm openclaw qa ${args.family}`,
   };
   if (args.resourceMetricsPath) {
     return aggregateResources([await readResourceMetrics(path.resolve(args.resourceMetricsPath))], measurement);

diff --git a/scripts/backfill-release-rss.test.mjs b/scripts/backfill-release-rss.test.mjs
@@ -60,7 +60,7 @@ test("backfills Telegram RSS without touching RTT p50/p95", async () => {
   assert.deepEqual(updated.resources.measurement, {
     kind: "process-max-rss",
     scope: "release-harness-command",
-    command: "pnpm rtt",
+    command: "pnpm test:docker:npm-telegram-live",
   });
   assert.deepEqual(updated.resources.maxRssKbSamples, [409600]);
   assert.equal(updated.resources.maxRssKb.max, 409600);