From 8626bfdaa8dbdb37d61fb32086382915c783a381 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 31 May 2026 16:24:03 +0100 Subject: [PATCH 1/4] feat(runner): HAPI_DISABLE_VERSION_HANDOFF opt-out for mtime self-restart The heartbeat in cli/src/runner/run.ts triggers spawnHappyCLI(['runner','start']) + process.exit(0) when getInstalledCliMtimeMs() differs from startedWithCliMtimeMs. The same mtime guard fires in controlClient.isRunnerRunningCurrentlyInstalledHappyVersion when a fresh CLI invocation inspects the live runner. For operators who own process supervision (systemd, tmux, custom rebuild pipelines, etc.), source-file mtimes shift for reasons unrelated to npm upgrades. The clean exit defeats Restart=on-failure under systemd and leaves the machine offline. Setting HAPI_DISABLE_VERSION_HANDOFF=1 in the runner's environment now skips both checks while keeping the rest of the heartbeat (session pruning, state file persistence) intact. Default behavior is unchanged for npm consumers. Co-authored-by: Cursor --- cli/src/runner/controlClient.ts | 28 +++++++++------ cli/src/runner/run.ts | 64 +++++++++++++++++++-------------- 2 files changed, 55 insertions(+), 37 deletions(-) diff --git a/cli/src/runner/controlClient.ts b/cli/src/runner/controlClient.ts index b4cf4551a..3690d2009 100644 --- a/cli/src/runner/controlClient.ts +++ b/cli/src/runner/controlClient.ts @@ -185,17 +185,25 @@ export async function isRunnerRunningCurrentlyInstalledHappyVersion(): Promise setTimeout(resolve, 10_000)); - process.exit(0); + // So we can just hang forever + logger.debug('[RUNNER RUN] Hanging for a bit - waiting for CLI to kill us because we are running outdated version of the code'); + await new Promise(resolve => setTimeout(resolve, 10_000)); + process.exit(0); + } } // Before wrecklessly overriting the runner state file, we should check if we are the ones who own it From 91d5745d422fbd3d2d258c52801ff5dc56811307 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sun, 31 May 2026 16:26:38 +0100 Subject: [PATCH 2/4] fix(runner): preserve original argv across self-restart and verify handoff The mtime-driven self-restart in cli/src/runner/run.ts spawned `hapi runner start` with no arguments, then process.exit(0)'d unconditionally after a 10s sleep. Two failure modes: 1. The forwarded `runner start-sync` lost the operator's --workspace-root flags (anything passed at the original invocation). Browse + spawn silently degraded to "no workspace roots". 2. If the replacement runner failed to come up at all (build was mid-flight, binary missing, etc.) the original runner still exited cleanly. Under systemd Restart=on-failure that means no runner is brought back, and the machine drops off the hub until manual intervention. Changes: - persistence.ts: add startedWithArgv?: string[] to RunnerLocallyPersistedState - run.ts: snapshot process.argv.slice(2) at startup, persist it on initial state write and on every heartbeat, replay it as the new runner's argv (default to ['runner','start-sync'] when nothing was captured) - controlClient.ts: new waitForRunnerHandoff(oldPid, {timeoutMs}) polls runner.state.json for a different live PID - run.ts: only clearInterval + process.exit(0) when handoff is confirmed. On spawn failure or 30s timeout, refresh the mtime baseline (so we don't respawn-loop on the same drift) and stay alive so the machine keeps serving. Co-authored-by: Cursor --- cli/src/persistence.ts | 7 +++ cli/src/runner/controlClient.ts | 32 +++++++++++++ cli/src/runner/run.ts | 80 ++++++++++++++++++++++++--------- 3 files changed, 99 insertions(+), 20 deletions(-) diff --git a/cli/src/persistence.ts b/cli/src/persistence.ts index 2a4f09869..9cc10f270 100644 --- a/cli/src/persistence.ts +++ b/cli/src/persistence.ts @@ -38,6 +38,13 @@ export interface RunnerLocallyPersistedState { startedWithApiUrl?: string; startedWithMachineId?: string; startedWithCliApiTokenHash?: string; + /** + * Original process.argv.slice(2) of the runner process at start time, e.g. + * ['runner', 'start-sync', '--workspace-root', '/home/user/code']. + * Used by the self-restart handoff so the replacement runner inherits the + * same workspace-root / flag configuration instead of starting with defaults. + */ + startedWithArgv?: string[]; lastHeartbeat?: string; runnerLogPath?: string; } diff --git a/cli/src/runner/controlClient.ts b/cli/src/runner/controlClient.ts index 3690d2009..2dccdcb47 100644 --- a/cli/src/runner/controlClient.ts +++ b/cli/src/runner/controlClient.ts @@ -225,6 +225,38 @@ export async function isRunnerRunningCurrentlyInstalledHappyVersion(): Promise { + const timeoutMs = options.timeoutMs ?? 30_000; + const pollIntervalMs = options.pollIntervalMs ?? 500; + const deadline = Date.now() + timeoutMs; + + while (Date.now() < deadline) { + try { + const state = await readRunnerState(); + if (state && state.pid !== oldPid && isProcessAlive(state.pid)) { + logger.debug(`[RUNNER CONTROL] Handoff confirmed: new runner PID ${state.pid} replaced ${oldPid}`); + return true; + } + } catch (error) { + logger.debug('[RUNNER CONTROL] Error polling runner state during handoff wait', error); + } + await new Promise(resolve => setTimeout(resolve, pollIntervalMs)); + } + logger.debug(`[RUNNER CONTROL] Handoff timeout: no replacement runner registered within ${timeoutMs}ms`); + return false; +} + export async function cleanupRunnerState(): Promise { try { await clearRunnerState(); diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts index 834b66754..74832760a 100644 --- a/cli/src/runner/run.ts +++ b/cli/src/runner/run.ts @@ -17,7 +17,7 @@ import { PERMISSION_MODES } from '@hapi/protocol/modes'; import { withRetry } from '@/utils/time'; import { isRetryableConnectionError } from '@/utils/errorUtils'; -import { cleanupRunnerState, getInstalledCliMtimeMs, isRunnerRunningCurrentlyInstalledHappyVersion, stopRunner } from './controlClient'; +import { cleanupRunnerState, getInstalledCliMtimeMs, isRunnerRunningCurrentlyInstalledHappyVersion, stopRunner, waitForRunnerHandoff } from './controlClient'; import { startRunnerControlServer } from './controlServer'; import { createWorktree, removeWorktree, type WorktreeInfo } from './worktree'; import { join } from 'path'; @@ -657,7 +657,16 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): onHappySessionWebhook }); - const startedWithCliMtimeMs = getInstalledCliMtimeMs(); + // Mutable: the heartbeat refreshes this baseline after a *failed* handoff + // so we do not respawn-loop on the same mtime drift every 60s. A successful + // handoff exits the process before the next tick, so the mutation is moot + // in the happy path. + let startedWithCliMtimeMs = getInstalledCliMtimeMs(); + // Snapshot original argv (excluding node/bun + entrypoint) so the heartbeat's + // self-restart handoff can rebuild the same `runner start-sync --workspace-root ...` + // invocation instead of losing flags. process.argv[0] is the runtime, [1] is the + // script; everything after is operator-supplied. + const startedWithArgv = process.argv.slice(2); // Write initial runner state (no lock needed for state file) const fileState: RunnerLocallyPersistedState = { @@ -669,6 +678,7 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): startedWithApiUrl: configuration.apiUrl, startedWithMachineId: machineId, startedWithCliApiTokenHash: hashRunnerCliApiToken(configuration.cliApiToken), + startedWithArgv, runnerLogPath: logger.logFilePath }; writeRunnerState(fileState); @@ -797,9 +807,10 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): // Check if runner needs update. // Skip entirely when the operator owns process supervision (systemd, tmux, - // soup rebuilds, etc.) and source mtimes change for reasons unrelated to - // an actual npm upgrade. HAPI_DISABLE_VERSION_HANDOFF=1 keeps the rest of - // the heartbeat (session pruning, state file persistence) intact. + // custom rebuild pipelines, etc.) and source mtimes change for reasons + // unrelated to an actual npm upgrade. HAPI_DISABLE_VERSION_HANDOFF=1 + // keeps the rest of the heartbeat (session pruning, state file + // persistence) intact. if (process.env.HAPI_DISABLE_VERSION_HANDOFF === '1') { if (process.env.DEBUG) { logger.debug('[RUNNER RUN] HAPI_DISABLE_VERSION_HANDOFF=1 set, skipping mtime/version drift self-restart'); @@ -809,29 +820,57 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): if (typeof installedCliMtimeMs === 'number' && typeof startedWithCliMtimeMs === 'number' && installedCliMtimeMs !== startedWithCliMtimeMs) { - logger.debug('[RUNNER RUN] Runner is outdated, triggering self-restart with latest version, clearing heartbeat interval'); + logger.debug('[RUNNER RUN] Runner is outdated, triggering self-restart with latest version'); + + // Hand off to a fresh runner that inherits our original argv (workspace + // roots, flags, etc). Previously this called `runner start` with no + // args, which forwarded an arg-less `runner start-sync` and lost the + // operator's --workspace-root configuration. Worse, the old code + // cleared the heartbeat interval and process.exit(0)'d unconditionally, + // so any failure in the replacement left the machine offline with no + // runner at all (especially under systemd Restart=on-failure, which + // does not restart on clean exits). + // + // New behaviour: + // 1. Replay the original argv (default: ['runner','start-sync']) so the + // new process boots with the same workspace roots / flags. + // 2. Wait for runner.state.json to show a different live PID, proving + // the replacement actually came up. + // 3. Only clear the heartbeat + exit when handoff is confirmed; if it + // fails, stay alive so the machine stays online. heartbeatRunning + // re-entry guard prevents this block from running concurrently if + // the next heartbeat fires while we are still waiting. + const handoffArgv = startedWithArgv.length > 0 + ? startedWithArgv + : ['runner', 'start-sync']; - clearInterval(restartOnStaleVersionAndHeartbeat); - - // Spawn new runner through the CLI - // We do not need to clean ourselves up - we will be killed by - // the CLI start command. - // 1. It will first check if runner is running (yes in this case) - // 2. If the version is stale (it will read runner.state.json file and check startedWithCliVersion) & compare it to its own version - // 3. Next it will start a new runner with the latest version with runner-sync :D - // Done! try { - spawnHappyCLI(['runner', 'start'], { + spawnHappyCLI(handoffArgv, { detached: true, stdio: 'ignore' }); } catch (error) { - logger.debug('[RUNNER RUN] Failed to spawn new runner, this is quite likely to happen during integration tests as we are cleaning out dist/ directory', error); + logger.debug('[RUNNER RUN] Failed to spawn replacement runner; staying alive to avoid an offline machine', error); + startedWithCliMtimeMs = installedCliMtimeMs; + heartbeatRunning = false; + return; + } + + logger.debug(`[RUNNER RUN] Spawned replacement runner with argv: ${JSON.stringify(handoffArgv)}; waiting for handoff`); + + const handoffOk = await waitForRunnerHandoff(process.pid, { timeoutMs: 30_000 }); + if (!handoffOk) { + logger.debug('[RUNNER RUN] Replacement runner did not register within 30s; staying alive to avoid leaving the machine offline'); + // Refresh baseline so we do not loop on the same mtime every tick. The + // next genuine mtime change (e.g. a later rebuild step) will retrigger + // a handoff attempt. + startedWithCliMtimeMs = installedCliMtimeMs; + heartbeatRunning = false; + return; } - // So we can just hang forever - logger.debug('[RUNNER RUN] Hanging for a bit - waiting for CLI to kill us because we are running outdated version of the code'); - await new Promise(resolve => setTimeout(resolve, 10_000)); + logger.debug('[RUNNER RUN] Handoff confirmed; clearing heartbeat and exiting cleanly'); + clearInterval(restartOnStaleVersionAndHeartbeat); process.exit(0); } } @@ -855,6 +894,7 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): startedWithApiUrl: fileState.startedWithApiUrl, startedWithMachineId: fileState.startedWithMachineId, startedWithCliApiTokenHash: fileState.startedWithCliApiTokenHash, + startedWithArgv, lastHeartbeat: new Date().toLocaleString(), runnerLogPath: fileState.runnerLogPath }; From a49fc57f7172b596e589a5f0d46b7c2c0012f59d Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:32:49 +0100 Subject: [PATCH 3/4] fix(runner): address Codex review findings on #814 Two Major correctness fixes flagged by upstream Codex review on PR #814: (1) Stale-mtime poisoning on failed handoff (run.ts:854,867) The previous failure paths assigned startedWithCliMtimeMs = installedCliMtimeMs which the next heartbeat persisted to runner.state.json. Downstream isRunnerRunningCurrentlyInstalledHappyVersion() then reported the still-stale runner as current, masking the failure until the *next* genuine mtime change. Symptom: an mtime change that briefly failed to hand off would be silently forgotten. Fix: leave startedWithCliMtimeMs immutable. Gate handoff entry on a new nextHandoffAttemptAt timestamp; failure paths bump it by HANDOFF_RETRY_BACKOFF_MS (5 min) via deferHandoffRetry(). The heartbeat continues to write the honest "still on the old code" mtime, and the runner naturally re-attempts after the cooldown. (2) HAPI_DISABLE_VERSION_HANDOFF not honored by live runner (controlClient.ts:192, persistence.ts) The env var was only checked in the invoking CLI process. Under the documented systemd use case the env is set on the service unit but NOT on the operator's interactive shell - so a shell `hapi runner start` would still treat mtime drift as stale and kill the supervised runner during a rebuild. The exact regression this layer was built to prevent. Fix: capture HAPI_DISABLE_VERSION_HANDOFF at runner start time into state.startedWithVersionHandoffDisabled, persisted via the heartbeat. The controlClient mtime check now OR's the live env var with the persisted snapshot, so any caller honours the running runner's opt-out regardless of their own environment. Tests: cli typecheck clean; 14/14 runner unit tests pass. Co-authored-by: Cursor --- cli/src/persistence.ts | 12 +++++++ cli/src/runner/controlClient.ts | 13 ++++++-- cli/src/runner/run.ts | 58 ++++++++++++++++++++++++--------- 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/cli/src/persistence.ts b/cli/src/persistence.ts index 9cc10f270..4635a7b2f 100644 --- a/cli/src/persistence.ts +++ b/cli/src/persistence.ts @@ -47,6 +47,18 @@ export interface RunnerLocallyPersistedState { startedWithArgv?: string[]; lastHeartbeat?: string; runnerLogPath?: string; + /** + * Snapshot of HAPI_DISABLE_VERSION_HANDOFF=1 at the time this runner + * started. Lets a later `hapi runner start` invocation (from a shell where + * the env var is NOT set, e.g. operator's interactive terminal vs a + * systemd service that owns supervision) honour the running runner's + * opt-out instead of treating mtime drift as a reason to kill it. + * + * Codex review #814 [Major]: env-only check in controlClient meant the + * supervised use case (env set on service only) would still trigger a + * mid-rebuild stop. Persisting this fixes that. + */ + startedWithVersionHandoffDisabled?: boolean; } export async function readSettings(): Promise { diff --git a/cli/src/runner/controlClient.ts b/cli/src/runner/controlClient.ts index 2dccdcb47..b1d8de3c8 100644 --- a/cli/src/runner/controlClient.ts +++ b/cli/src/runner/controlClient.ts @@ -189,8 +189,17 @@ export async function isRunnerRunningCurrentlyInstalledHappyVersion(): Promise { if (heartbeatRunning) { return; @@ -819,7 +838,8 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): const installedCliMtimeMs = getInstalledCliMtimeMs(); if (typeof installedCliMtimeMs === 'number' && typeof startedWithCliMtimeMs === 'number' && - installedCliMtimeMs !== startedWithCliMtimeMs) { + installedCliMtimeMs !== startedWithCliMtimeMs && + Date.now() >= nextHandoffAttemptAt) { logger.debug('[RUNNER RUN] Runner is outdated, triggering self-restart with latest version'); // Hand off to a fresh runner that inherits our original argv (workspace @@ -844,15 +864,26 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): ? startedWithArgv : ['runner', 'start-sync']; + // On any failure path: reschedule the next handoff attempt for + // HANDOFF_RETRY_BACKOFF_MS into the future. We deliberately do NOT + // touch startedWithCliMtimeMs - the heartbeat must continue to + // persist the honest "still on the old code" value into + // runner.state.json so external tooling (and the controlClient + // mtime check) can see the runner is stale. Codex review #814 + // [Major]. + const deferHandoffRetry = () => { + nextHandoffAttemptAt = Date.now() + HANDOFF_RETRY_BACKOFF_MS; + heartbeatRunning = false; + }; + try { spawnHappyCLI(handoffArgv, { detached: true, stdio: 'ignore' }); } catch (error) { - logger.debug('[RUNNER RUN] Failed to spawn replacement runner; staying alive to avoid an offline machine', error); - startedWithCliMtimeMs = installedCliMtimeMs; - heartbeatRunning = false; + logger.debug(`[RUNNER RUN] Failed to spawn replacement runner; staying alive to avoid an offline machine. Next handoff attempt in ${Math.round(HANDOFF_RETRY_BACKOFF_MS / 1000)}s.`, error); + deferHandoffRetry(); return; } @@ -860,12 +891,8 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): const handoffOk = await waitForRunnerHandoff(process.pid, { timeoutMs: 30_000 }); if (!handoffOk) { - logger.debug('[RUNNER RUN] Replacement runner did not register within 30s; staying alive to avoid leaving the machine offline'); - // Refresh baseline so we do not loop on the same mtime every tick. The - // next genuine mtime change (e.g. a later rebuild step) will retrigger - // a handoff attempt. - startedWithCliMtimeMs = installedCliMtimeMs; - heartbeatRunning = false; + logger.debug(`[RUNNER RUN] Replacement runner did not register within 30s; staying alive to avoid leaving the machine offline. Next handoff attempt in ${Math.round(HANDOFF_RETRY_BACKOFF_MS / 1000)}s.`); + deferHandoffRetry(); return; } @@ -895,6 +922,7 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): startedWithMachineId: fileState.startedWithMachineId, startedWithCliApiTokenHash: fileState.startedWithCliApiTokenHash, startedWithArgv, + startedWithVersionHandoffDisabled, lastHeartbeat: new Date().toLocaleString(), runnerLogPath: fileState.runnerLogPath }; From 4a03f42f4998af6630e9f6a8bdfba1bd0b04e3d0 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:55:24 +0100 Subject: [PATCH 4/4] fix(runner): address Codex #814 [Major] argv-capture + handoff race Two additional Major findings on the runner self-restart layer that were not addressed in a49fc57: 1. run.ts:672 - process.argv.slice(2) returns ['start-sync', ...] in compiled binary mode (raw argv is [hapi, runner, start-sync, ...]), so the handoff spawned `hapi start-sync ...` which resolveCommand treats as an unknown top-level and falls back to Claude. Replaced with getCliArgs() (the project's canonical argv normalizer) plus a defensive guard that falls back to ['runner', 'start-sync'] if the captured argv does not begin with 'runner'. 2. run.ts:892 - waitForRunnerHandoff did not actually keep the old runner alive. The child's startRunner() unconditionally called stopRunner() before acquiring the lock or writing its own state, so the parent's /stop handler resolved shutdown and exited BEFORE the child committed. If the child then failed (lock contention, auth error, anything between stopRunner and writeRunnerState), the machine went offline with no runner at all. New handoff protocol: - Parent sets HAPI_RUNNER_HANDOFF_FROM_PID= on the spawned child's env, then releases the lock BEFORE entering waitForRunnerHandoff (breaks the parent-holds-lock / child-needs-lock-to-write-state deadlock). - On wait-timeout the parent re-acquires the lock (long-retry, 30s) and defers retry; if re-acquire fails (third party took the lock) the parent exits cleanly so it does not stay alive without the lock invariant. - Child detects the env signal; if state.pid matches and that pid is alive, this is an authorized handoff: skip stopRunner(), skip the version-match early-exit, and acquire the lock with a longer retry window (60 attempts x 500ms) so it waits through the parent's asynchronous release. CLI typecheck clean. 14/14 runner unit tests still pass. The wider 46/664 failures in the CLI suite are pre-existing in this branch (unrelated: AppServerEventConverter, cursorEventConverter, hook server, Query) - baseline before this commit has 42+; my changes do not regress them. Co-authored-by: Cursor --- cli/src/runner/run.ts | 154 +++++++++++++++++++++++++++++++++++------- 1 file changed, 130 insertions(+), 24 deletions(-) diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts index b4ced8165..e10d5795b 100644 --- a/cli/src/runner/run.ts +++ b/cli/src/runner/run.ts @@ -12,6 +12,7 @@ import packageJson from '../../package.json'; import { getEnvironmentInfo } from '@/ui/doctor'; import { spawnHappyCLI } from '@/utils/spawnHappyCLI'; import { writeRunnerState, RunnerLocallyPersistedState, readRunnerState, acquireRunnerLock, releaseRunnerLock } from '@/persistence'; +import { getCliArgs } from '@/utils/cliArgs'; import { isProcessAlive, isWindows, killProcess, killProcessByChildProcess } from '@/utils/process'; import { PERMISSION_MODES } from '@hapi/protocol/modes'; import { withRetry } from '@/utils/time'; @@ -98,24 +99,73 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): logger.debug('[RUNNER RUN] Starting runner process...'); logger.debugLargeJson('[RUNNER RUN] Environment', getEnvironmentInfo()); - // Check if already running - // Check if running runner version matches current CLI version - const runningRunnerVersionMatches = await isRunnerRunningCurrentlyInstalledHappyVersion(); - if (!runningRunnerVersionMatches) { - logger.debug('[RUNNER RUN] Runner version mismatch detected, restarting runner with current CLI version'); - await stopRunner(); - } else { - logger.debug('[RUNNER RUN] Runner version matches, keeping existing runner'); - console.log('Runner already running with matching version'); - process.exit(0); + // Detect authorized handoff from a parent runner doing the + // mtime-drift self-restart (see heartbeat block below). The parent sets + // HAPI_RUNNER_HANDOFF_FROM_PID= on the spawned child's env; + // if it matches the live state pid, we are the explicit replacement and + // must NOT trigger `stopRunner()` (parent will voluntarily release the + // lock and exit once it sees our pid in state). + // + // Codex review #814 [Major] on run.ts:892 -- previously the child + // unconditionally called `stopRunner()` before acquiring the lock or + // writing its own state. `/stop` resolved the parent's shutdown, + // deleted runner.state.json, and released the lock BEFORE the child + // had committed itself; if the child then failed (lock contention, + // auth error, anything between stopRunner and writeRunnerState), the + // machine went offline with no runner at all. + // + // New protocol: + // - Parent: spawn child with HAPI_RUNNER_HANDOFF_FROM_PID env, then + // release lock, then wait for state.pid to change to a different + // live pid. On wait-timeout the parent re-acquires the lock and + // defers retry; on wait-success the parent clearInterval + exit. + // - Child: detect env, skip stopRunner(), skip the version-match + // early-exit, acquire the lock with a longer retry window (parent + // releases asynchronously, so we may need to wait). + const handoffFromPidRaw = process.env.HAPI_RUNNER_HANDOFF_FROM_PID; + const handoffFromPid = handoffFromPidRaw ? Number(handoffFromPidRaw) : NaN; + let isAuthorizedHandoff = false; + if (Number.isFinite(handoffFromPid) && handoffFromPid > 0) { + const existingState = await readRunnerState(); + if (existingState?.pid === handoffFromPid && isProcessAlive(handoffFromPid)) { + isAuthorizedHandoff = true; + logger.debug(`[RUNNER RUN] Authorized handoff from parent runner PID ${handoffFromPid}; skipping stopRunner() and version-match short-circuit`); + } else { + logger.debug(`[RUNNER RUN] HAPI_RUNNER_HANDOFF_FROM_PID=${handoffFromPidRaw} set but no matching live parent in state (state.pid=${existingState?.pid ?? 'none'}); ignoring handoff signal`); + } + } + + if (!isAuthorizedHandoff) { + // Check if already running + // Check if running runner version matches current CLI version + const runningRunnerVersionMatches = await isRunnerRunningCurrentlyInstalledHappyVersion(); + if (!runningRunnerVersionMatches) { + logger.debug('[RUNNER RUN] Runner version mismatch detected, restarting runner with current CLI version'); + await stopRunner(); + } else { + logger.debug('[RUNNER RUN] Runner version matches, keeping existing runner'); + console.log('Runner already running with matching version'); + process.exit(0); + } } - // Acquire exclusive lock (proves runner is running) - const runnerLockHandle = await acquireRunnerLock(5, 200); - if (!runnerLockHandle) { + // Acquire exclusive lock (proves runner is running). + // In authorized-handoff mode the parent is still alive holding the lock + // and will release it asynchronously once we are spawned, so wait longer + // (30s) instead of giving up after the standard 1s. Outside handoff, + // preserve the original 5x200ms = 1s "already running" semantics. + const lockMaxAttempts = isAuthorizedHandoff ? 60 : 5; + const lockDelayIncrementMs = isAuthorizedHandoff ? 500 : 200; + const initialLockHandle = await acquireRunnerLock(lockMaxAttempts, lockDelayIncrementMs); + if (!initialLockHandle) { logger.debug('[RUNNER RUN] Runner lock file already held, another runner is running'); process.exit(0); } + // `let` because the heartbeat-restart handoff path below may release and + // re-acquire it. After the null guard above, the initial handle is + // non-null; the heartbeat path's failure branches either reassign to a + // re-acquired handle or process.exit() before any subsequent release. + let runnerLockHandle = initialLockHandle; // At this point we should be safe to startup the runner: // 1. Not have a stale runner state @@ -665,11 +715,26 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): // (still-stale) runner is reported as current. Throttle handoff attempts // via nextHandoffAttemptAt below instead of mutating this baseline. const startedWithCliMtimeMs = getInstalledCliMtimeMs(); - // Snapshot original argv (excluding node/bun + entrypoint) so the heartbeat's - // self-restart handoff can rebuild the same `runner start-sync --workspace-root ...` - // invocation instead of losing flags. process.argv[0] is the runtime, [1] is the - // script; everything after is operator-supplied. - const startedWithArgv = process.argv.slice(2); + // Snapshot original CLI argv via the project's canonical normalizer so the + // heartbeat's self-restart handoff can rebuild the same `runner start-sync + // --workspace-root ...` invocation instead of losing flags. + // + // Codex review #814 [Major]: previously `process.argv.slice(2)` was used, + // but in compiled binary mode (`bun build --compile`) the raw argv shape is + // `[hapi, runner, start-sync, ...]` so slice(2) produced `['start-sync', ...]`. + // The replacement then spawned `hapi start-sync ...`, which `resolveCommand` + // treats as an unknown top-level command - falling back to Claude instead + // of starting the runner. `getCliArgs()` strips runtime + entrypoint + // correctly in all execution modes. + // + // Defensive guard: only replay the captured argv when it actually starts + // with `runner`. If the normalizer ever returns something else (unexpected + // shape, future refactor), fall back to the safe default so the handoff + // can never resolve to a non-runner command. + const rawStartedWithArgv = getCliArgs(); + const startedWithArgv = rawStartedWithArgv[0] === 'runner' + ? rawStartedWithArgv + : ['runner', 'start-sync']; // Snapshot at start: did this runner opt out of version handoff via env? // Persisted to runner.state.json so a later `hapi runner start` from a // shell where the env var is NOT set can still honour the opt-out @@ -860,9 +925,10 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): // fails, stay alive so the machine stays online. heartbeatRunning // re-entry guard prevents this block from running concurrently if // the next heartbeat fires while we are still waiting. - const handoffArgv = startedWithArgv.length > 0 - ? startedWithArgv - : ['runner', 'start-sync']; + // + // startedWithArgv is guaranteed by the normalizer + defensive guard + // above to start with `runner`, so we replay it directly. + const handoffArgv = startedWithArgv; // On any failure path: reschedule the next handoff attempt for // HANDOFF_RETRY_BACKOFF_MS into the future. We deliberately do NOT @@ -876,10 +942,18 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): heartbeatRunning = false; }; + // Spawn the replacement with HAPI_RUNNER_HANDOFF_FROM_PID set so + // the child knows it is an authorized handoff and must NOT call + // stopRunner() against us before writing its own state. + // Codex review #814 [Major] on run.ts:892. try { spawnHappyCLI(handoffArgv, { detached: true, - stdio: 'ignore' + stdio: 'ignore', + env: { + ...process.env, + HAPI_RUNNER_HANDOFF_FROM_PID: String(process.pid) + } }); } catch (error) { logger.debug(`[RUNNER RUN] Failed to spawn replacement runner; staying alive to avoid an offline machine. Next handoff attempt in ${Math.round(HANDOFF_RETRY_BACKOFF_MS / 1000)}s.`, error); @@ -887,11 +961,43 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): return; } - logger.debug(`[RUNNER RUN] Spawned replacement runner with argv: ${JSON.stringify(handoffArgv)}; waiting for handoff`); + // Release the lock so the child can acquire it. The child is + // configured (above, in its startRunner() entry) with a longer + // lock-retry window when HAPI_RUNNER_HANDOFF_FROM_PID is set, so + // it will wait through this release. We deliberately release + // BEFORE waitForRunnerHandoff to break the original deadlock: + // child cannot write state (and thus cannot signal handoff + // success) until it holds the lock, and the lock is ours until + // we release. + try { + await releaseRunnerLock(runnerLockHandle); + } catch (error) { + logger.debug('[RUNNER RUN] Failed to release lock for child handoff; continuing wait anyway', error); + } + + logger.debug(`[RUNNER RUN] Spawned replacement runner with argv: ${JSON.stringify(handoffArgv)}; released lock; waiting for handoff`); const handoffOk = await waitForRunnerHandoff(process.pid, { timeoutMs: 30_000 }); if (!handoffOk) { - logger.debug(`[RUNNER RUN] Replacement runner did not register within 30s; staying alive to avoid leaving the machine offline. Next handoff attempt in ${Math.round(HANDOFF_RETRY_BACKOFF_MS / 1000)}s.`); + logger.debug(`[RUNNER RUN] Replacement runner did not register within 30s; attempting to re-acquire lock and stay alive to avoid leaving the machine offline.`); + // Re-acquire the lock with a long window (the child has likely + // either succeeded and we're seeing a stale state, or it gave + // up - in either case the lock should be available shortly). + const reacquired = await acquireRunnerLock(60, 500); + if (!reacquired) { + // Lock is held by someone else (third-party runner, or a + // child that succeeded but state file hasn't reflected the + // new pid yet). Cleanest action: exit, log clearly. The + // operator will see an offline machine if the holder also + // dies, but staying alive without the lock invariant is + // worse - it lets a parallel runner register against the + // same machine id. + logger.debug('[RUNNER RUN] Could not re-acquire runner lock after failed handoff; another process holds it. Exiting cleanly.'); + clearInterval(restartOnStaleVersionAndHeartbeat); + process.exit(0); + return; + } + runnerLockHandle = reacquired; deferHandoffRetry(); return; }