From 8626bfdaa8dbdb37d61fb32086382915c783a381 Mon Sep 17 00:00:00 2001
From: HeavyGee <133152184+heavygee@users.noreply.github.com>
Date: Sun, 31 May 2026 16:24:03 +0100
Subject: [PATCH 1/4] feat(runner): HAPI_DISABLE_VERSION_HANDOFF opt-out for
 mtime self-restart

The heartbeat in cli/src/runner/run.ts triggers spawnHappyCLI(['runner','start'])
+ process.exit(0) when getInstalledCliMtimeMs() differs from startedWithCliMtimeMs.
The same mtime guard fires in controlClient.isRunnerRunningCurrentlyInstalledHappyVersion
when a fresh CLI invocation inspects the live runner.

For operators who own process supervision (systemd, tmux, custom rebuild
pipelines, etc.), source-file mtimes shift for reasons unrelated to npm
upgrades. The clean exit defeats Restart=on-failure under systemd and
leaves the machine offline.

Setting HAPI_DISABLE_VERSION_HANDOFF=1 in the runner's environment now skips
both checks while keeping the rest of the heartbeat (session pruning, state
file persistence) intact. Default behavior is unchanged for npm consumers.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cli/src/runner/controlClient.ts | 28 +++++++++------
 cli/src/runner/run.ts           | 64 +++++++++++++++++++--------------
 2 files changed, 55 insertions(+), 37 deletions(-)

diff --git a/cli/src/runner/controlClient.ts b/cli/src/runner/controlClient.ts
index b4cf4551a..3690d2009 100644
--- a/cli/src/runner/controlClient.ts
+++ b/cli/src/runner/controlClient.ts
@@ -185,17 +185,25 @@ export async function isRunnerRunningCurrentlyInstalledHappyVersion(): Promise<b
   const currentMachineId = settings.machineId;
   
   try {
-    const currentCliMtimeMs = getInstalledCliMtimeMs();
-    if (typeof currentCliMtimeMs === 'number' && typeof state.startedWithCliMtimeMs === 'number') {
-      logger.debug(`[RUNNER CONTROL] Current CLI mtime: ${currentCliMtimeMs}, Runner started with mtime: ${state.startedWithCliMtimeMs}`);
-      if (currentCliMtimeMs !== state.startedWithCliMtimeMs) {
-        return false;
-      }
+    // When HAPI_DISABLE_VERSION_HANDOFF=1 is set on the live runner (operator
+    // owns supervision via systemd/tmux/custom rebuild pipelines), a fresh CLI invocation must NOT
+    // treat the running runner as stale just because source mtimes shifted.
+    // Otherwise `hapi runner start` would kill the live runner mid-rebuild.
+    if (process.env.HAPI_DISABLE_VERSION_HANDOFF === '1') {
+      logger.debug('[RUNNER CONTROL] HAPI_DISABLE_VERSION_HANDOFF=1 set, skipping mtime/version drift check');
     } else {
-      const currentCliVersion = packageJson.version;
-      logger.debug(`[RUNNER CONTROL] Current CLI version: ${currentCliVersion}, Runner started with version: ${state.startedWithCliVersion}`);
-      if (currentCliVersion !== state.startedWithCliVersion) {
-        return false;
+      const currentCliMtimeMs = getInstalledCliMtimeMs();
+      if (typeof currentCliMtimeMs === 'number' && typeof state.startedWithCliMtimeMs === 'number') {
+        logger.debug(`[RUNNER CONTROL] Current CLI mtime: ${currentCliMtimeMs}, Runner started with mtime: ${state.startedWithCliMtimeMs}`);
+        if (currentCliMtimeMs !== state.startedWithCliMtimeMs) {
+          return false;
+        }
+      } else {
+        const currentCliVersion = packageJson.version;
+        logger.debug(`[RUNNER CONTROL] Current CLI version: ${currentCliVersion}, Runner started with version: ${state.startedWithCliVersion}`);
+        if (currentCliVersion !== state.startedWithCliVersion) {
+          return false;
+        }
       }
     }
 
diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts
index 3f276cf3d..834b66754 100644
--- a/cli/src/runner/run.ts
+++ b/cli/src/runner/run.ts
@@ -795,35 +795,45 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
         }
       }
 
-      // Check if runner needs update
-      const installedCliMtimeMs = getInstalledCliMtimeMs();
-      if (typeof installedCliMtimeMs === 'number' &&
-          typeof startedWithCliMtimeMs === 'number' &&
-          installedCliMtimeMs !== startedWithCliMtimeMs) {
-        logger.debug('[RUNNER RUN] Runner is outdated, triggering self-restart with latest version, clearing heartbeat interval');
-
-        clearInterval(restartOnStaleVersionAndHeartbeat);
-
-        // Spawn new runner through the CLI
-        // We do not need to clean ourselves up - we will be killed by
-        // the CLI start command.
-        // 1. It will first check if runner is running (yes in this case)
-        // 2. If the version is stale (it will read runner.state.json file and check startedWithCliVersion) & compare it to its own version
-        // 3. Next it will start a new runner with the latest version with runner-sync :D
-        // Done!
-        try {
-          spawnHappyCLI(['runner', 'start'], {
-            detached: true,
-            stdio: 'ignore'
-          });
-        } catch (error) {
-          logger.debug('[RUNNER RUN] Failed to spawn new runner, this is quite likely to happen during integration tests as we are cleaning out dist/ directory', error);
+      // Check if runner needs update.
+      // Skip entirely when the operator owns process supervision (systemd, tmux,
+      // soup rebuilds, etc.) and source mtimes change for reasons unrelated to
+      // an actual npm upgrade. HAPI_DISABLE_VERSION_HANDOFF=1 keeps the rest of
+      // the heartbeat (session pruning, state file persistence) intact.
+      if (process.env.HAPI_DISABLE_VERSION_HANDOFF === '1') {
+        if (process.env.DEBUG) {
+          logger.debug('[RUNNER RUN] HAPI_DISABLE_VERSION_HANDOFF=1 set, skipping mtime/version drift self-restart');
         }
+      } else {
+        const installedCliMtimeMs = getInstalledCliMtimeMs();
+        if (typeof installedCliMtimeMs === 'number' &&
+            typeof startedWithCliMtimeMs === 'number' &&
+            installedCliMtimeMs !== startedWithCliMtimeMs) {
+          logger.debug('[RUNNER RUN] Runner is outdated, triggering self-restart with latest version, clearing heartbeat interval');
+
+          clearInterval(restartOnStaleVersionAndHeartbeat);
+
+          // Spawn new runner through the CLI
+          // We do not need to clean ourselves up - we will be killed by
+          // the CLI start command.
+          // 1. It will first check if runner is running (yes in this case)
+          // 2. If the version is stale (it will read runner.state.json file and check startedWithCliVersion) & compare it to its own version
+          // 3. Next it will start a new runner with the latest version with runner-sync :D
+          // Done!
+          try {
+            spawnHappyCLI(['runner', 'start'], {
+              detached: true,
+              stdio: 'ignore'
+            });
+          } catch (error) {
+            logger.debug('[RUNNER RUN] Failed to spawn new runner, this is quite likely to happen during integration tests as we are cleaning out dist/ directory', error);
+          }
 
-        // So we can just hang forever
-        logger.debug('[RUNNER RUN] Hanging for a bit - waiting for CLI to kill us because we are running outdated version of the code');
-        await new Promise(resolve => setTimeout(resolve, 10_000));
-        process.exit(0);
+          // So we can just hang forever
+          logger.debug('[RUNNER RUN] Hanging for a bit - waiting for CLI to kill us because we are running outdated version of the code');
+          await new Promise(resolve => setTimeout(resolve, 10_000));
+          process.exit(0);
+        }
       }
 
       // Before wrecklessly overriting the runner state file, we should check if we are the ones who own it

From 91d5745d422fbd3d2d258c52801ff5dc56811307 Mon Sep 17 00:00:00 2001
From: HeavyGee <133152184+heavygee@users.noreply.github.com>
Date: Sun, 31 May 2026 16:26:38 +0100
Subject: [PATCH 2/4] fix(runner): preserve original argv across self-restart
 and verify handoff

The mtime-driven self-restart in cli/src/runner/run.ts spawned
`hapi runner start` with no arguments, then process.exit(0)'d
unconditionally after a 10s sleep. Two failure modes:

1. The forwarded `runner start-sync` lost the operator's --workspace-root
   flags (anything passed at the original invocation). Browse + spawn
   silently degraded to "no workspace roots".
2. If the replacement runner failed to come up at all (build was mid-flight,
   binary missing, etc.) the original runner still exited cleanly. Under
   systemd Restart=on-failure that means no runner is brought back, and
   the machine drops off the hub until manual intervention.

Changes:

- persistence.ts: add startedWithArgv?: string[] to RunnerLocallyPersistedState
- run.ts: snapshot process.argv.slice(2) at startup, persist it on initial
  state write and on every heartbeat, replay it as the new runner's argv
  (default to ['runner','start-sync'] when nothing was captured)
- controlClient.ts: new waitForRunnerHandoff(oldPid, {timeoutMs}) polls
  runner.state.json for a different live PID
- run.ts: only clearInterval + process.exit(0) when handoff is confirmed.
  On spawn failure or 30s timeout, refresh the mtime baseline (so we don't
  respawn-loop on the same drift) and stay alive so the machine keeps
  serving.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cli/src/persistence.ts          |  7 +++
 cli/src/runner/controlClient.ts | 32 +++++++++++++
 cli/src/runner/run.ts           | 80 ++++++++++++++++++++++++---------
 3 files changed, 99 insertions(+), 20 deletions(-)

diff --git a/cli/src/persistence.ts b/cli/src/persistence.ts
index 2a4f09869..9cc10f270 100644
--- a/cli/src/persistence.ts
+++ b/cli/src/persistence.ts
@@ -38,6 +38,13 @@ export interface RunnerLocallyPersistedState {
   startedWithApiUrl?: string;
   startedWithMachineId?: string;
   startedWithCliApiTokenHash?: string;
+  /**
+   * Original process.argv.slice(2) of the runner process at start time, e.g.
+   * ['runner', 'start-sync', '--workspace-root', '/home/user/code'].
+   * Used by the self-restart handoff so the replacement runner inherits the
+   * same workspace-root / flag configuration instead of starting with defaults.
+   */
+  startedWithArgv?: string[];
   lastHeartbeat?: string;
   runnerLogPath?: string;
 }
diff --git a/cli/src/runner/controlClient.ts b/cli/src/runner/controlClient.ts
index 3690d2009..2dccdcb47 100644
--- a/cli/src/runner/controlClient.ts
+++ b/cli/src/runner/controlClient.ts
@@ -225,6 +225,38 @@ export async function isRunnerRunningCurrentlyInstalledHappyVersion(): Promise<b
   }
 }
 
+/**
+ * Poll the runner state file waiting for a different PID to take ownership.
+ * Used by the self-restart handoff in run.ts so the dying runner does not exit
+ * until its replacement has actually come up and written its own state.
+ *
+ * Returns true when runner.state.json shows a different (and live) PID than
+ * `oldPid`, false on timeout.
+ */
+export async function waitForRunnerHandoff(
+  oldPid: number,
+  options: { timeoutMs?: number; pollIntervalMs?: number } = {}
+): Promise<boolean> {
+  const timeoutMs = options.timeoutMs ?? 30_000;
+  const pollIntervalMs = options.pollIntervalMs ?? 500;
+  const deadline = Date.now() + timeoutMs;
+
+  while (Date.now() < deadline) {
+    try {
+      const state = await readRunnerState();
+      if (state && state.pid !== oldPid && isProcessAlive(state.pid)) {
+        logger.debug(`[RUNNER CONTROL] Handoff confirmed: new runner PID ${state.pid} replaced ${oldPid}`);
+        return true;
+      }
+    } catch (error) {
+      logger.debug('[RUNNER CONTROL] Error polling runner state during handoff wait', error);
+    }
+    await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
+  }
+  logger.debug(`[RUNNER CONTROL] Handoff timeout: no replacement runner registered within ${timeoutMs}ms`);
+  return false;
+}
+
 export async function cleanupRunnerState(): Promise<void> {
   try {
     await clearRunnerState();
diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts
index 834b66754..74832760a 100644
--- a/cli/src/runner/run.ts
+++ b/cli/src/runner/run.ts
@@ -17,7 +17,7 @@ import { PERMISSION_MODES } from '@hapi/protocol/modes';
 import { withRetry } from '@/utils/time';
 import { isRetryableConnectionError } from '@/utils/errorUtils';
 
-import { cleanupRunnerState, getInstalledCliMtimeMs, isRunnerRunningCurrentlyInstalledHappyVersion, stopRunner } from './controlClient';
+import { cleanupRunnerState, getInstalledCliMtimeMs, isRunnerRunningCurrentlyInstalledHappyVersion, stopRunner, waitForRunnerHandoff } from './controlClient';
 import { startRunnerControlServer } from './controlServer';
 import { createWorktree, removeWorktree, type WorktreeInfo } from './worktree';
 import { join } from 'path';
@@ -657,7 +657,16 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
       onHappySessionWebhook
     });
 
-    const startedWithCliMtimeMs = getInstalledCliMtimeMs();
+    // Mutable: the heartbeat refreshes this baseline after a *failed* handoff
+    // so we do not respawn-loop on the same mtime drift every 60s. A successful
+    // handoff exits the process before the next tick, so the mutation is moot
+    // in the happy path.
+    let startedWithCliMtimeMs = getInstalledCliMtimeMs();
+    // Snapshot original argv (excluding node/bun + entrypoint) so the heartbeat's
+    // self-restart handoff can rebuild the same `runner start-sync --workspace-root ...`
+    // invocation instead of losing flags. process.argv[0] is the runtime, [1] is the
+    // script; everything after is operator-supplied.
+    const startedWithArgv = process.argv.slice(2);
 
     // Write initial runner state (no lock needed for state file)
     const fileState: RunnerLocallyPersistedState = {
@@ -669,6 +678,7 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
       startedWithApiUrl: configuration.apiUrl,
       startedWithMachineId: machineId,
       startedWithCliApiTokenHash: hashRunnerCliApiToken(configuration.cliApiToken),
+      startedWithArgv,
       runnerLogPath: logger.logFilePath
     };
     writeRunnerState(fileState);
@@ -797,9 +807,10 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
 
       // Check if runner needs update.
       // Skip entirely when the operator owns process supervision (systemd, tmux,
-      // soup rebuilds, etc.) and source mtimes change for reasons unrelated to
-      // an actual npm upgrade. HAPI_DISABLE_VERSION_HANDOFF=1 keeps the rest of
-      // the heartbeat (session pruning, state file persistence) intact.
+      // custom rebuild pipelines, etc.) and source mtimes change for reasons
+      // unrelated to an actual npm upgrade. HAPI_DISABLE_VERSION_HANDOFF=1
+      // keeps the rest of the heartbeat (session pruning, state file
+      // persistence) intact.
       if (process.env.HAPI_DISABLE_VERSION_HANDOFF === '1') {
         if (process.env.DEBUG) {
           logger.debug('[RUNNER RUN] HAPI_DISABLE_VERSION_HANDOFF=1 set, skipping mtime/version drift self-restart');
@@ -809,29 +820,57 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
         if (typeof installedCliMtimeMs === 'number' &&
             typeof startedWithCliMtimeMs === 'number' &&
             installedCliMtimeMs !== startedWithCliMtimeMs) {
-          logger.debug('[RUNNER RUN] Runner is outdated, triggering self-restart with latest version, clearing heartbeat interval');
+          logger.debug('[RUNNER RUN] Runner is outdated, triggering self-restart with latest version');
+
+          // Hand off to a fresh runner that inherits our original argv (workspace
+          // roots, flags, etc). Previously this called `runner start` with no
+          // args, which forwarded an arg-less `runner start-sync` and lost the
+          // operator's --workspace-root configuration. Worse, the old code
+          // cleared the heartbeat interval and process.exit(0)'d unconditionally,
+          // so any failure in the replacement left the machine offline with no
+          // runner at all (especially under systemd Restart=on-failure, which
+          // does not restart on clean exits).
+          //
+          // New behaviour:
+          // 1. Replay the original argv (default: ['runner','start-sync']) so the
+          //    new process boots with the same workspace roots / flags.
+          // 2. Wait for runner.state.json to show a different live PID, proving
+          //    the replacement actually came up.
+          // 3. Only clear the heartbeat + exit when handoff is confirmed; if it
+          //    fails, stay alive so the machine stays online. heartbeatRunning
+          //    re-entry guard prevents this block from running concurrently if
+          //    the next heartbeat fires while we are still waiting.
+          const handoffArgv = startedWithArgv.length > 0
+            ? startedWithArgv
+            : ['runner', 'start-sync'];
 
-          clearInterval(restartOnStaleVersionAndHeartbeat);
-
-          // Spawn new runner through the CLI
-          // We do not need to clean ourselves up - we will be killed by
-          // the CLI start command.
-          // 1. It will first check if runner is running (yes in this case)
-          // 2. If the version is stale (it will read runner.state.json file and check startedWithCliVersion) & compare it to its own version
-          // 3. Next it will start a new runner with the latest version with runner-sync :D
-          // Done!
           try {
-            spawnHappyCLI(['runner', 'start'], {
+            spawnHappyCLI(handoffArgv, {
               detached: true,
               stdio: 'ignore'
             });
           } catch (error) {
-            logger.debug('[RUNNER RUN] Failed to spawn new runner, this is quite likely to happen during integration tests as we are cleaning out dist/ directory', error);
+            logger.debug('[RUNNER RUN] Failed to spawn replacement runner; staying alive to avoid an offline machine', error);
+            startedWithCliMtimeMs = installedCliMtimeMs;
+            heartbeatRunning = false;
+            return;
+          }
+
+          logger.debug(`[RUNNER RUN] Spawned replacement runner with argv: ${JSON.stringify(handoffArgv)}; waiting for handoff`);
+
+          const handoffOk = await waitForRunnerHandoff(process.pid, { timeoutMs: 30_000 });
+          if (!handoffOk) {
+            logger.debug('[RUNNER RUN] Replacement runner did not register within 30s; staying alive to avoid leaving the machine offline');
+            // Refresh baseline so we do not loop on the same mtime every tick. The
+            // next genuine mtime change (e.g. a later rebuild step) will retrigger
+            // a handoff attempt.
+            startedWithCliMtimeMs = installedCliMtimeMs;
+            heartbeatRunning = false;
+            return;
           }
 
-          // So we can just hang forever
-          logger.debug('[RUNNER RUN] Hanging for a bit - waiting for CLI to kill us because we are running outdated version of the code');
-          await new Promise(resolve => setTimeout(resolve, 10_000));
+          logger.debug('[RUNNER RUN] Handoff confirmed; clearing heartbeat and exiting cleanly');
+          clearInterval(restartOnStaleVersionAndHeartbeat);
           process.exit(0);
         }
       }
@@ -855,6 +894,7 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
           startedWithApiUrl: fileState.startedWithApiUrl,
           startedWithMachineId: fileState.startedWithMachineId,
           startedWithCliApiTokenHash: fileState.startedWithCliApiTokenHash,
+          startedWithArgv,
           lastHeartbeat: new Date().toLocaleString(),
           runnerLogPath: fileState.runnerLogPath
         };

From a49fc57f7172b596e589a5f0d46b7c2c0012f59d Mon Sep 17 00:00:00 2001
From: HeavyGee <133152184+heavygee@users.noreply.github.com>
Date: Sat, 6 Jun 2026 16:32:49 +0100
Subject: [PATCH 3/4] fix(runner): address Codex review findings on #814

Two Major correctness fixes flagged by upstream Codex review on PR #814:

(1) Stale-mtime poisoning on failed handoff (run.ts:854,867)

The previous failure paths assigned
  startedWithCliMtimeMs = installedCliMtimeMs
which the next heartbeat persisted to runner.state.json. Downstream
isRunnerRunningCurrentlyInstalledHappyVersion() then reported the
still-stale runner as current, masking the failure until the *next*
genuine mtime change. Symptom: an mtime change that briefly failed
to hand off would be silently forgotten.

Fix: leave startedWithCliMtimeMs immutable. Gate handoff entry on
a new nextHandoffAttemptAt timestamp; failure paths bump it by
HANDOFF_RETRY_BACKOFF_MS (5 min) via deferHandoffRetry(). The
heartbeat continues to write the honest "still on the old code"
mtime, and the runner naturally re-attempts after the cooldown.

(2) HAPI_DISABLE_VERSION_HANDOFF not honored by live runner
    (controlClient.ts:192, persistence.ts)

The env var was only checked in the invoking CLI process. Under the
documented systemd use case the env is set on the service unit but
NOT on the operator's interactive shell - so a shell `hapi runner
start` would still treat mtime drift as stale and kill the supervised
runner during a rebuild. The exact regression this layer was built
to prevent.

Fix: capture HAPI_DISABLE_VERSION_HANDOFF at runner start time into
state.startedWithVersionHandoffDisabled, persisted via the heartbeat.
The controlClient mtime check now OR's the live env var with the
persisted snapshot, so any caller honours the running runner's
opt-out regardless of their own environment.

Tests: cli typecheck clean; 14/14 runner unit tests pass.
Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cli/src/persistence.ts          | 12 +++++++
 cli/src/runner/controlClient.ts | 13 ++++++--
 cli/src/runner/run.ts           | 58 ++++++++++++++++++++++++---------
 3 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/cli/src/persistence.ts b/cli/src/persistence.ts
index 9cc10f270..4635a7b2f 100644
--- a/cli/src/persistence.ts
+++ b/cli/src/persistence.ts
@@ -47,6 +47,18 @@ export interface RunnerLocallyPersistedState {
   startedWithArgv?: string[];
   lastHeartbeat?: string;
   runnerLogPath?: string;
+  /**
+   * Snapshot of HAPI_DISABLE_VERSION_HANDOFF=1 at the time this runner
+   * started. Lets a later `hapi runner start` invocation (from a shell where
+   * the env var is NOT set, e.g. operator's interactive terminal vs a
+   * systemd service that owns supervision) honour the running runner's
+   * opt-out instead of treating mtime drift as a reason to kill it.
+   *
+   * Codex review #814 [Major]: env-only check in controlClient meant the
+   * supervised use case (env set on service only) would still trigger a
+   * mid-rebuild stop. Persisting this fixes that.
+   */
+  startedWithVersionHandoffDisabled?: boolean;
 }
 
 export async function readSettings(): Promise<Settings> {
diff --git a/cli/src/runner/controlClient.ts b/cli/src/runner/controlClient.ts
index 2dccdcb47..b1d8de3c8 100644
--- a/cli/src/runner/controlClient.ts
+++ b/cli/src/runner/controlClient.ts
@@ -189,8 +189,17 @@ export async function isRunnerRunningCurrentlyInstalledHappyVersion(): Promise<b
     // owns supervision via systemd/tmux/custom rebuild pipelines), a fresh CLI invocation must NOT
     // treat the running runner as stale just because source mtimes shifted.
     // Otherwise `hapi runner start` would kill the live runner mid-rebuild.
-    if (process.env.HAPI_DISABLE_VERSION_HANDOFF === '1') {
-      logger.debug('[RUNNER CONTROL] HAPI_DISABLE_VERSION_HANDOFF=1 set, skipping mtime/version drift check');
+    //
+    // Per Codex review #814 [Major]: the env var is commonly only set on the
+    // supervising service (systemd unit file), NOT on the operator's
+    // interactive shell. To honour the running runner's opt-out from any
+    // caller, OR the live env-var check with the persisted-at-start snapshot
+    // in state.startedWithVersionHandoffDisabled.
+    if (
+      process.env.HAPI_DISABLE_VERSION_HANDOFF === '1'
+      || state.startedWithVersionHandoffDisabled === true
+    ) {
+      logger.debug('[RUNNER CONTROL] Version handoff disabled (env or persisted state), skipping mtime/version drift check');
     } else {
       const currentCliMtimeMs = getInstalledCliMtimeMs();
       if (typeof currentCliMtimeMs === 'number' && typeof state.startedWithCliMtimeMs === 'number') {
diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts
index 74832760a..b4ced8165 100644
--- a/cli/src/runner/run.ts
+++ b/cli/src/runner/run.ts
@@ -657,16 +657,24 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
       onHappySessionWebhook
     });
 
-    // Mutable: the heartbeat refreshes this baseline after a *failed* handoff
-    // so we do not respawn-loop on the same mtime drift every 60s. A successful
-    // handoff exits the process before the next tick, so the mutation is moot
-    // in the happy path.
-    let startedWithCliMtimeMs = getInstalledCliMtimeMs();
+    // Baseline mtime at runner-process start. Immutable: per Codex review #814
+    // [Major], we must NOT refresh this on a failed handoff because the
+    // heartbeat writes it into runner.state.json, and downstream
+    // isRunnerRunningCurrentlyInstalledHappyVersion() compares the stored
+    // value against the live installed mtime - if they match, the live
+    // (still-stale) runner is reported as current. Throttle handoff attempts
+    // via nextHandoffAttemptAt below instead of mutating this baseline.
+    const startedWithCliMtimeMs = getInstalledCliMtimeMs();
     // Snapshot original argv (excluding node/bun + entrypoint) so the heartbeat's
     // self-restart handoff can rebuild the same `runner start-sync --workspace-root ...`
     // invocation instead of losing flags. process.argv[0] is the runtime, [1] is the
     // script; everything after is operator-supplied.
     const startedWithArgv = process.argv.slice(2);
+    // Snapshot at start: did this runner opt out of version handoff via env?
+    // Persisted to runner.state.json so a later `hapi runner start` from a
+    // shell where the env var is NOT set can still honour the opt-out
+    // (Codex review #814 [Major] - controlClient.ts:192 fix).
+    const startedWithVersionHandoffDisabled = process.env.HAPI_DISABLE_VERSION_HANDOFF === '1';
 
     // Write initial runner state (no lock needed for state file)
     const fileState: RunnerLocallyPersistedState = {
@@ -679,6 +687,7 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
       startedWithMachineId: machineId,
       startedWithCliApiTokenHash: hashRunnerCliApiToken(configuration.cliApiToken),
       startedWithArgv,
+      startedWithVersionHandoffDisabled,
       runnerLogPath: logger.logFilePath
     };
     writeRunnerState(fileState);
@@ -787,6 +796,16 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
     // 4. Write heartbeat
     const heartbeatIntervalMs = parseInt(process.env.HAPI_RUNNER_HEARTBEAT_INTERVAL || '60000');
     let heartbeatRunning = false
+    // Timestamp (ms) gate for handoff retries after a failed spawn or timed-out
+    // handoff. Per Codex review #814 [Major], we previously mutated
+    // startedWithCliMtimeMs to throttle re-attempts, but that polluted
+    // runner.state.json (the heartbeat persists this value) and caused
+    // isRunnerRunningCurrentlyInstalledHappyVersion() to lie about the live
+    // runner being current. The honest fix: leave startedWithCliMtimeMs
+    // immutable, gate handoff entry on this timestamp, and refresh it from
+    // the failure path with a backoff window.
+    let nextHandoffAttemptAt = 0;
+    const HANDOFF_RETRY_BACKOFF_MS = 5 * 60_000;
     const restartOnStaleVersionAndHeartbeat = setInterval(async () => {
       if (heartbeatRunning) {
         return;
@@ -819,7 +838,8 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
         const installedCliMtimeMs = getInstalledCliMtimeMs();
         if (typeof installedCliMtimeMs === 'number' &&
             typeof startedWithCliMtimeMs === 'number' &&
-            installedCliMtimeMs !== startedWithCliMtimeMs) {
+            installedCliMtimeMs !== startedWithCliMtimeMs &&
+            Date.now() >= nextHandoffAttemptAt) {
           logger.debug('[RUNNER RUN] Runner is outdated, triggering self-restart with latest version');
 
           // Hand off to a fresh runner that inherits our original argv (workspace
@@ -844,15 +864,26 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
             ? startedWithArgv
             : ['runner', 'start-sync'];
 
+          // On any failure path: reschedule the next handoff attempt for
+          // HANDOFF_RETRY_BACKOFF_MS into the future. We deliberately do NOT
+          // touch startedWithCliMtimeMs - the heartbeat must continue to
+          // persist the honest "still on the old code" value into
+          // runner.state.json so external tooling (and the controlClient
+          // mtime check) can see the runner is stale. Codex review #814
+          // [Major].
+          const deferHandoffRetry = () => {
+            nextHandoffAttemptAt = Date.now() + HANDOFF_RETRY_BACKOFF_MS;
+            heartbeatRunning = false;
+          };
+
           try {
             spawnHappyCLI(handoffArgv, {
               detached: true,
               stdio: 'ignore'
             });
           } catch (error) {
-            logger.debug('[RUNNER RUN] Failed to spawn replacement runner; staying alive to avoid an offline machine', error);
-            startedWithCliMtimeMs = installedCliMtimeMs;
-            heartbeatRunning = false;
+            logger.debug(`[RUNNER RUN] Failed to spawn replacement runner; staying alive to avoid an offline machine. Next handoff attempt in ${Math.round(HANDOFF_RETRY_BACKOFF_MS / 1000)}s.`, error);
+            deferHandoffRetry();
             return;
           }
 
@@ -860,12 +891,8 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
 
           const handoffOk = await waitForRunnerHandoff(process.pid, { timeoutMs: 30_000 });
           if (!handoffOk) {
-            logger.debug('[RUNNER RUN] Replacement runner did not register within 30s; staying alive to avoid leaving the machine offline');
-            // Refresh baseline so we do not loop on the same mtime every tick. The
-            // next genuine mtime change (e.g. a later rebuild step) will retrigger
-            // a handoff attempt.
-            startedWithCliMtimeMs = installedCliMtimeMs;
-            heartbeatRunning = false;
+            logger.debug(`[RUNNER RUN] Replacement runner did not register within 30s; staying alive to avoid leaving the machine offline. Next handoff attempt in ${Math.round(HANDOFF_RETRY_BACKOFF_MS / 1000)}s.`);
+            deferHandoffRetry();
             return;
           }
 
@@ -895,6 +922,7 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
           startedWithMachineId: fileState.startedWithMachineId,
           startedWithCliApiTokenHash: fileState.startedWithCliApiTokenHash,
           startedWithArgv,
+          startedWithVersionHandoffDisabled,
           lastHeartbeat: new Date().toLocaleString(),
           runnerLogPath: fileState.runnerLogPath
         };

From 4a03f42f4998af6630e9f6a8bdfba1bd0b04e3d0 Mon Sep 17 00:00:00 2001
From: HeavyGee <133152184+heavygee@users.noreply.github.com>
Date: Sat, 6 Jun 2026 16:55:24 +0100
Subject: [PATCH 4/4] fix(runner): address Codex #814 [Major] argv-capture +
 handoff race

Two additional Major findings on the runner self-restart layer that
were not addressed in a49fc57:

1. run.ts:672 - process.argv.slice(2) returns ['start-sync', ...] in
   compiled binary mode (raw argv is [hapi, runner, start-sync, ...]),
   so the handoff spawned `hapi start-sync ...` which resolveCommand
   treats as an unknown top-level and falls back to Claude. Replaced
   with getCliArgs() (the project's canonical argv normalizer) plus a
   defensive guard that falls back to ['runner', 'start-sync'] if the
   captured argv does not begin with 'runner'.

2. run.ts:892 - waitForRunnerHandoff did not actually keep the old
   runner alive. The child's startRunner() unconditionally called
   stopRunner() before acquiring the lock or writing its own state,
   so the parent's /stop handler resolved shutdown and exited BEFORE
   the child committed. If the child then failed (lock contention,
   auth error, anything between stopRunner and writeRunnerState),
   the machine went offline with no runner at all.

   New handoff protocol:
   - Parent sets HAPI_RUNNER_HANDOFF_FROM_PID=<pid> on the spawned
     child's env, then releases the lock BEFORE entering
     waitForRunnerHandoff (breaks the parent-holds-lock /
     child-needs-lock-to-write-state deadlock).
   - On wait-timeout the parent re-acquires the lock (long-retry, 30s)
     and defers retry; if re-acquire fails (third party took the
     lock) the parent exits cleanly so it does not stay alive without
     the lock invariant.
   - Child detects the env signal; if state.pid matches and that pid
     is alive, this is an authorized handoff: skip stopRunner(),
     skip the version-match early-exit, and acquire the lock with a
     longer retry window (60 attempts x 500ms) so it waits through
     the parent's asynchronous release.

CLI typecheck clean. 14/14 runner unit tests still pass. The wider
46/664 failures in the CLI suite are pre-existing in this branch
(unrelated: AppServerEventConverter, cursorEventConverter, hook
server, Query) - baseline before this commit has 42+; my changes do
not regress them.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cli/src/runner/run.ts | 154 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 130 insertions(+), 24 deletions(-)

diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts
index b4ced8165..e10d5795b 100644
--- a/cli/src/runner/run.ts
+++ b/cli/src/runner/run.ts
@@ -12,6 +12,7 @@ import packageJson from '../../package.json';
 import { getEnvironmentInfo } from '@/ui/doctor';
 import { spawnHappyCLI } from '@/utils/spawnHappyCLI';
 import { writeRunnerState, RunnerLocallyPersistedState, readRunnerState, acquireRunnerLock, releaseRunnerLock } from '@/persistence';
+import { getCliArgs } from '@/utils/cliArgs';
 import { isProcessAlive, isWindows, killProcess, killProcessByChildProcess } from '@/utils/process';
 import { PERMISSION_MODES } from '@hapi/protocol/modes';
 import { withRetry } from '@/utils/time';
@@ -98,24 +99,73 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
   logger.debug('[RUNNER RUN] Starting runner process...');
   logger.debugLargeJson('[RUNNER RUN] Environment', getEnvironmentInfo());
 
-  // Check if already running
-  // Check if running runner version matches current CLI version
-  const runningRunnerVersionMatches = await isRunnerRunningCurrentlyInstalledHappyVersion();
-  if (!runningRunnerVersionMatches) {
-    logger.debug('[RUNNER RUN] Runner version mismatch detected, restarting runner with current CLI version');
-    await stopRunner();
-  } else {
-    logger.debug('[RUNNER RUN] Runner version matches, keeping existing runner');
-    console.log('Runner already running with matching version');
-    process.exit(0);
+  // Detect authorized handoff from a parent runner doing the
+  // mtime-drift self-restart (see heartbeat block below). The parent sets
+  // HAPI_RUNNER_HANDOFF_FROM_PID=<parent_pid> on the spawned child's env;
+  // if it matches the live state pid, we are the explicit replacement and
+  // must NOT trigger `stopRunner()` (parent will voluntarily release the
+  // lock and exit once it sees our pid in state).
+  //
+  // Codex review #814 [Major] on run.ts:892 -- previously the child
+  // unconditionally called `stopRunner()` before acquiring the lock or
+  // writing its own state. `/stop` resolved the parent's shutdown,
+  // deleted runner.state.json, and released the lock BEFORE the child
+  // had committed itself; if the child then failed (lock contention,
+  // auth error, anything between stopRunner and writeRunnerState), the
+  // machine went offline with no runner at all.
+  //
+  // New protocol:
+  //   - Parent: spawn child with HAPI_RUNNER_HANDOFF_FROM_PID env, then
+  //     release lock, then wait for state.pid to change to a different
+  //     live pid. On wait-timeout the parent re-acquires the lock and
+  //     defers retry; on wait-success the parent clearInterval + exit.
+  //   - Child: detect env, skip stopRunner(), skip the version-match
+  //     early-exit, acquire the lock with a longer retry window (parent
+  //     releases asynchronously, so we may need to wait).
+  const handoffFromPidRaw = process.env.HAPI_RUNNER_HANDOFF_FROM_PID;
+  const handoffFromPid = handoffFromPidRaw ? Number(handoffFromPidRaw) : NaN;
+  let isAuthorizedHandoff = false;
+  if (Number.isFinite(handoffFromPid) && handoffFromPid > 0) {
+    const existingState = await readRunnerState();
+    if (existingState?.pid === handoffFromPid && isProcessAlive(handoffFromPid)) {
+      isAuthorizedHandoff = true;
+      logger.debug(`[RUNNER RUN] Authorized handoff from parent runner PID ${handoffFromPid}; skipping stopRunner() and version-match short-circuit`);
+    } else {
+      logger.debug(`[RUNNER RUN] HAPI_RUNNER_HANDOFF_FROM_PID=${handoffFromPidRaw} set but no matching live parent in state (state.pid=${existingState?.pid ?? 'none'}); ignoring handoff signal`);
+    }
+  }
+
+  if (!isAuthorizedHandoff) {
+    // Check if already running
+    // Check if running runner version matches current CLI version
+    const runningRunnerVersionMatches = await isRunnerRunningCurrentlyInstalledHappyVersion();
+    if (!runningRunnerVersionMatches) {
+      logger.debug('[RUNNER RUN] Runner version mismatch detected, restarting runner with current CLI version');
+      await stopRunner();
+    } else {
+      logger.debug('[RUNNER RUN] Runner version matches, keeping existing runner');
+      console.log('Runner already running with matching version');
+      process.exit(0);
+    }
   }
 
-  // Acquire exclusive lock (proves runner is running)
-  const runnerLockHandle = await acquireRunnerLock(5, 200);
-  if (!runnerLockHandle) {
+  // Acquire exclusive lock (proves runner is running).
+  // In authorized-handoff mode the parent is still alive holding the lock
+  // and will release it asynchronously once we are spawned, so wait longer
+  // (30s) instead of giving up after the standard 1s. Outside handoff,
+  // preserve the original 5x200ms = 1s "already running" semantics.
+  const lockMaxAttempts = isAuthorizedHandoff ? 60 : 5;
+  const lockDelayIncrementMs = isAuthorizedHandoff ? 500 : 200;
+  const initialLockHandle = await acquireRunnerLock(lockMaxAttempts, lockDelayIncrementMs);
+  if (!initialLockHandle) {
     logger.debug('[RUNNER RUN] Runner lock file already held, another runner is running');
     process.exit(0);
   }
+  // `let` because the heartbeat-restart handoff path below may release and
+  // re-acquire it. After the null guard above, the initial handle is
+  // non-null; the heartbeat path's failure branches either reassign to a
+  // re-acquired handle or process.exit() before any subsequent release.
+  let runnerLockHandle = initialLockHandle;
 
   // At this point we should be safe to startup the runner:
   // 1. Not have a stale runner state
@@ -665,11 +715,26 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
     // (still-stale) runner is reported as current. Throttle handoff attempts
     // via nextHandoffAttemptAt below instead of mutating this baseline.
     const startedWithCliMtimeMs = getInstalledCliMtimeMs();
-    // Snapshot original argv (excluding node/bun + entrypoint) so the heartbeat's
-    // self-restart handoff can rebuild the same `runner start-sync --workspace-root ...`
-    // invocation instead of losing flags. process.argv[0] is the runtime, [1] is the
-    // script; everything after is operator-supplied.
-    const startedWithArgv = process.argv.slice(2);
+    // Snapshot original CLI argv via the project's canonical normalizer so the
+    // heartbeat's self-restart handoff can rebuild the same `runner start-sync
+    // --workspace-root ...` invocation instead of losing flags.
+    //
+    // Codex review #814 [Major]: previously `process.argv.slice(2)` was used,
+    // but in compiled binary mode (`bun build --compile`) the raw argv shape is
+    // `[hapi, runner, start-sync, ...]` so slice(2) produced `['start-sync', ...]`.
+    // The replacement then spawned `hapi start-sync ...`, which `resolveCommand`
+    // treats as an unknown top-level command - falling back to Claude instead
+    // of starting the runner. `getCliArgs()` strips runtime + entrypoint
+    // correctly in all execution modes.
+    //
+    // Defensive guard: only replay the captured argv when it actually starts
+    // with `runner`. If the normalizer ever returns something else (unexpected
+    // shape, future refactor), fall back to the safe default so the handoff
+    // can never resolve to a non-runner command.
+    const rawStartedWithArgv = getCliArgs();
+    const startedWithArgv = rawStartedWithArgv[0] === 'runner'
+      ? rawStartedWithArgv
+      : ['runner', 'start-sync'];
     // Snapshot at start: did this runner opt out of version handoff via env?
     // Persisted to runner.state.json so a later `hapi runner start` from a
     // shell where the env var is NOT set can still honour the opt-out
@@ -860,9 +925,10 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
           //    fails, stay alive so the machine stays online. heartbeatRunning
           //    re-entry guard prevents this block from running concurrently if
           //    the next heartbeat fires while we are still waiting.
-          const handoffArgv = startedWithArgv.length > 0
-            ? startedWithArgv
-            : ['runner', 'start-sync'];
+          //
+          // startedWithArgv is guaranteed by the normalizer + defensive guard
+          // above to start with `runner`, so we replay it directly.
+          const handoffArgv = startedWithArgv;
 
           // On any failure path: reschedule the next handoff attempt for
           // HANDOFF_RETRY_BACKOFF_MS into the future. We deliberately do NOT
@@ -876,10 +942,18 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
             heartbeatRunning = false;
           };
 
+          // Spawn the replacement with HAPI_RUNNER_HANDOFF_FROM_PID set so
+          // the child knows it is an authorized handoff and must NOT call
+          // stopRunner() against us before writing its own state.
+          // Codex review #814 [Major] on run.ts:892.
           try {
             spawnHappyCLI(handoffArgv, {
               detached: true,
-              stdio: 'ignore'
+              stdio: 'ignore',
+              env: {
+                ...process.env,
+                HAPI_RUNNER_HANDOFF_FROM_PID: String(process.pid)
+              }
             });
           } catch (error) {
             logger.debug(`[RUNNER RUN] Failed to spawn replacement runner; staying alive to avoid an offline machine. Next handoff attempt in ${Math.round(HANDOFF_RETRY_BACKOFF_MS / 1000)}s.`, error);
@@ -887,11 +961,43 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}):
             return;
           }
 
-          logger.debug(`[RUNNER RUN] Spawned replacement runner with argv: ${JSON.stringify(handoffArgv)}; waiting for handoff`);
+          // Release the lock so the child can acquire it. The child is
+          // configured (above, in its startRunner() entry) with a longer
+          // lock-retry window when HAPI_RUNNER_HANDOFF_FROM_PID is set, so
+          // it will wait through this release. We deliberately release
+          // BEFORE waitForRunnerHandoff to break the original deadlock:
+          // child cannot write state (and thus cannot signal handoff
+          // success) until it holds the lock, and the lock is ours until
+          // we release.
+          try {
+            await releaseRunnerLock(runnerLockHandle);
+          } catch (error) {
+            logger.debug('[RUNNER RUN] Failed to release lock for child handoff; continuing wait anyway', error);
+          }
+
+          logger.debug(`[RUNNER RUN] Spawned replacement runner with argv: ${JSON.stringify(handoffArgv)}; released lock; waiting for handoff`);
 
           const handoffOk = await waitForRunnerHandoff(process.pid, { timeoutMs: 30_000 });
           if (!handoffOk) {
-            logger.debug(`[RUNNER RUN] Replacement runner did not register within 30s; staying alive to avoid leaving the machine offline. Next handoff attempt in ${Math.round(HANDOFF_RETRY_BACKOFF_MS / 1000)}s.`);
+            logger.debug(`[RUNNER RUN] Replacement runner did not register within 30s; attempting to re-acquire lock and stay alive to avoid leaving the machine offline.`);
+            // Re-acquire the lock with a long window (the child has likely
+            // either succeeded and we're seeing a stale state, or it gave
+            // up - in either case the lock should be available shortly).
+            const reacquired = await acquireRunnerLock(60, 500);
+            if (!reacquired) {
+              // Lock is held by someone else (third-party runner, or a
+              // child that succeeded but state file hasn't reflected the
+              // new pid yet). Cleanest action: exit, log clearly. The
+              // operator will see an offline machine if the holder also
+              // dies, but staying alive without the lock invariant is
+              // worse - it lets a parallel runner register against the
+              // same machine id.
+              logger.debug('[RUNNER RUN] Could not re-acquire runner lock after failed handoff; another process holds it. Exiting cleanly.');
+              clearInterval(restartOnStaleVersionAndHeartbeat);
+              process.exit(0);
+              return;
+            }
+            runnerLockHandle = reacquired;
             deferHandoffRetry();
             return;
           }