Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions cli/src/persistence.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,27 @@ export interface RunnerLocallyPersistedState {
startedWithApiUrl?: string;
startedWithMachineId?: string;
startedWithCliApiTokenHash?: string;
/**
* Original process.argv.slice(2) of the runner process at start time, e.g.
* ['runner', 'start-sync', '--workspace-root', '/home/user/code'].
* Used by the self-restart handoff so the replacement runner inherits the
* same workspace-root / flag configuration instead of starting with defaults.
*/
startedWithArgv?: string[];
lastHeartbeat?: string;
runnerLogPath?: string;
/**
* Snapshot of HAPI_DISABLE_VERSION_HANDOFF=1 at the time this runner
* started. Lets a later `hapi runner start` invocation (from a shell where
* the env var is NOT set, e.g. operator's interactive terminal vs a
* systemd service that owns supervision) honour the running runner's
* opt-out instead of treating mtime drift as a reason to kill it.
*
* Codex review #814 [Major]: env-only check in controlClient meant the
* supervised use case (env set on service only) would still trigger a
* mid-rebuild stop. Persisting this fixes that.
*/
startedWithVersionHandoffDisabled?: boolean;
}

export async function readSettings(): Promise<Settings> {
Expand Down
69 changes: 59 additions & 10 deletions cli/src/runner/controlClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,17 +185,34 @@ export async function isRunnerRunningCurrentlyInstalledHappyVersion(): Promise<b
const currentMachineId = settings.machineId;

try {
const currentCliMtimeMs = getInstalledCliMtimeMs();
if (typeof currentCliMtimeMs === 'number' && typeof state.startedWithCliMtimeMs === 'number') {
logger.debug(`[RUNNER CONTROL] Current CLI mtime: ${currentCliMtimeMs}, Runner started with mtime: ${state.startedWithCliMtimeMs}`);
if (currentCliMtimeMs !== state.startedWithCliMtimeMs) {
return false;
}
// When HAPI_DISABLE_VERSION_HANDOFF=1 is set on the live runner (operator
// owns supervision via systemd/tmux/custom rebuild pipelines), a fresh CLI invocation must NOT
// treat the running runner as stale just because source mtimes shifted.
// Otherwise `hapi runner start` would kill the live runner mid-rebuild.
//
// Per Codex review #814 [Major]: the env var is commonly only set on the
// supervising service (systemd unit file), NOT on the operator's
// interactive shell. To honour the running runner's opt-out from any
// caller, OR the live env-var check with the persisted-at-start snapshot
// in state.startedWithVersionHandoffDisabled.
if (
process.env.HAPI_DISABLE_VERSION_HANDOFF === '1'
|| state.startedWithVersionHandoffDisabled === true
) {
logger.debug('[RUNNER CONTROL] Version handoff disabled (env or persisted state), skipping mtime/version drift check');
} else {
const currentCliVersion = packageJson.version;
logger.debug(`[RUNNER CONTROL] Current CLI version: ${currentCliVersion}, Runner started with version: ${state.startedWithCliVersion}`);
if (currentCliVersion !== state.startedWithCliVersion) {
return false;
const currentCliMtimeMs = getInstalledCliMtimeMs();
if (typeof currentCliMtimeMs === 'number' && typeof state.startedWithCliMtimeMs === 'number') {
logger.debug(`[RUNNER CONTROL] Current CLI mtime: ${currentCliMtimeMs}, Runner started with mtime: ${state.startedWithCliMtimeMs}`);
if (currentCliMtimeMs !== state.startedWithCliMtimeMs) {
return false;
}
} else {
const currentCliVersion = packageJson.version;
logger.debug(`[RUNNER CONTROL] Current CLI version: ${currentCliVersion}, Runner started with version: ${state.startedWithCliVersion}`);
if (currentCliVersion !== state.startedWithCliVersion) {
return false;
}
}
}

Expand All @@ -217,6 +234,38 @@ export async function isRunnerRunningCurrentlyInstalledHappyVersion(): Promise<b
}
}

/**
* Poll the runner state file waiting for a different PID to take ownership.
* Used by the self-restart handoff in run.ts so the dying runner does not exit
* until its replacement has actually come up and written its own state.
*
* Returns true when runner.state.json shows a different (and live) PID than
* `oldPid`, false on timeout.
*/
export async function waitForRunnerHandoff(
oldPid: number,
options: { timeoutMs?: number; pollIntervalMs?: number } = {}
): Promise<boolean> {
const timeoutMs = options.timeoutMs ?? 30_000;
const pollIntervalMs = options.pollIntervalMs ?? 500;
const deadline = Date.now() + timeoutMs;

while (Date.now() < deadline) {
try {
const state = await readRunnerState();
if (state && state.pid !== oldPid && isProcessAlive(state.pid)) {
logger.debug(`[RUNNER CONTROL] Handoff confirmed: new runner PID ${state.pid} replaced ${oldPid}`);
return true;
}
} catch (error) {
logger.debug('[RUNNER CONTROL] Error polling runner state during handoff wait', error);
}
await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
}
logger.debug(`[RUNNER CONTROL] Handoff timeout: no replacement runner registered within ${timeoutMs}ms`);
return false;
}

export async function cleanupRunnerState(): Promise<void> {
try {
await clearRunnerState();
Expand Down
Loading
Loading