From 928b1b6e78cd2b0e8c12d7cf4f001ab6e830a8e0 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 15 Jun 2026 19:28:48 +0100 Subject: [PATCH 1/5] fix(hub,cli): four hub-restart-cascade cleanup bugs (#913 #914 #916 #919) These four contained bugs were uncovered by a 2026-06-15 hub-restart incident where `hapi-restart-hub` SIGTERMed 23 cursor ACP sessions. Each fix lands independently of the architectural #915 (hub-restart cascade-archive) and the hypothesis-pending #917 (reopen creates dead session); audit-trail correctness and idempotency wins stand on their own. #913 - cursor ACP fresh-session: cursorSessionId persistence race. Fresh ACP sessions could be SIGTERMed during the async `update-metadata` ACK round-trip, stranding the on-disk ACP store with no DB handle. Add `ApiSessionClient.flushMetadata()` and await it after `onSessionFoundWithProtocol` on the fresh-session branch. Resume-path pre-registration (PR #834) is unchanged. #914 - default archiveReason 'User terminated' polluted audit trail. Hub-restart-cascade SIGTERMs went through the same path as web-UI Archive clicks, both writing archiveReason='User terminated'. New default is 'Hub restart'; the KillSession RPC handler (the authoritative user-archive signal) now explicitly stamps 'User terminated' before cleanupAndExit. SIGINT (local-terminal Ctrl-C) keeps the 'User terminated' label too. #916 - POST /api/sessions/:id/archive returned 500 when CLI is gone. `rpcGateway.killSession` threw a generic Error when no target socket was registered, and the archive route surfaced that as 500. Add typed `RpcTargetMissingError`, narrow on it in `syncEngine.archiveSession`, fall back to a hub-side `markSessionArchivedFromHub` write so lifecycleState still flips to 'archived'. Drop the requireActive guard on the route and 2xx-noop for already-archived rows. #919 - sessionCache: three metadata writers threw on version-mismatch without refresh, producing forever-409 on rename/reopen until an unrelated event triggered a cache refresh. `renameSession`, `clearSessionArchiveMetadata`, `restoreSessionArchiveMetadata` now retry-with-refresh (5 attempts, then throw) mirroring the existing good pattern in `mergeSessions`. Refs tiann/hapi#913 Refs tiann/hapi#914 Refs tiann/hapi#916 Refs tiann/hapi#919 AI disclosure: implementation by Claude Sonnet 4.5 (Cursor agent peer) under operator supervision. Issue triage by a sibling discovery agent. Per CONTRIBUTING.md AI-assisted contributions policy. Co-authored-by: Cursor --- cli/src/agent/runnerLifecycle.test.ts | 73 +++++ cli/src/agent/runnerLifecycle.ts | 19 +- cli/src/api/apiSession.ts | 15 + .../claude/registerKillSessionHandler.test.ts | 65 +++++ cli/src/claude/registerKillSessionHandler.ts | 29 +- cli/src/claude/runClaude.ts | 2 +- cli/src/codex/runCodex.ts | 2 +- .../cursor/cursorAcpRemoteLauncher.test.ts | 66 +++++ cli/src/cursor/cursorAcpRemoteLauncher.ts | 12 + cli/src/cursor/runCursor.ts | 2 +- cli/src/gemini/runGemini.ts | 2 +- cli/src/kimi/runKimi.ts | 2 +- cli/src/opencode/runOpencode.ts | 2 +- hub/src/sync/rpcGateway.test.ts | 47 ++- hub/src/sync/rpcGateway.ts | 25 +- hub/src/sync/sessionCache.ts | 274 ++++++++++++------ hub/src/sync/sessionModel.test.ts | 209 +++++++++++++ hub/src/sync/syncEngine.ts | 20 +- hub/src/web/routes/sessions.test.ts | 87 ++++++ hub/src/web/routes/sessions.ts | 11 +- 20 files changed, 854 insertions(+), 110 deletions(-) create mode 100644 cli/src/agent/runnerLifecycle.test.ts create mode 100644 cli/src/claude/registerKillSessionHandler.test.ts diff --git a/cli/src/agent/runnerLifecycle.test.ts b/cli/src/agent/runnerLifecycle.test.ts new file mode 100644 index 0000000000..7b3ac02b6f --- /dev/null +++ b/cli/src/agent/runnerLifecycle.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, it, vi } from 'vitest' +import { createRunnerLifecycle } from './runnerLifecycle' + +// tiann/hapi#914: the runnerLifecycle's default archiveReason is now +// 'Hub restart' (was 'User terminated'). Out-of-band SIGTERM from the +// hub-restart cascade keeps that default. Explicit user actions +// (clicking Archive in the web UI, Ctrl-C in a local terminal, +// uncaught exception) reassign the reason before archive metadata is +// written. + +function makeFakeSession() { + const metadataWrites: Array> = [] + return { + updateMetadata: vi.fn((handler: (m: Record) => Record) => { + const next = handler({}) + metadataWrites.push(next) + return next + }), + sendSessionDeath: vi.fn(), + flush: vi.fn(async () => {}), + close: vi.fn(async () => {}), + metadataWrites + } +} + +describe('createRunnerLifecycle archiveReason defaults (tiann/hapi#914)', () => { + it('uses Hub restart as the default archiveReason when no override is applied', async () => { + const session = makeFakeSession() + const lifecycle = createRunnerLifecycle({ + session: session as unknown as Parameters[0]['session'], + logTag: 'test' + }) + + await lifecycle.cleanup() + + expect(session.metadataWrites).toHaveLength(1) + expect(session.metadataWrites[0]).toMatchObject({ + lifecycleState: 'archived', + archivedBy: 'cli', + archiveReason: 'Hub restart' + }) + }) + + it('writes the operator-supplied reason when setArchiveReason is called (e.g. KillSession RPC)', async () => { + const session = makeFakeSession() + const lifecycle = createRunnerLifecycle({ + session: session as unknown as Parameters[0]['session'], + logTag: 'test' + }) + + lifecycle.setArchiveReason('User terminated') + await lifecycle.cleanup() + + expect(session.metadataWrites[0]).toMatchObject({ + archiveReason: 'User terminated' + }) + }) + + it('markCrash overrides the default reason to "Session crashed"', async () => { + const session = makeFakeSession() + const lifecycle = createRunnerLifecycle({ + session: session as unknown as Parameters[0]['session'], + logTag: 'test' + }) + + lifecycle.markCrash(new Error('boom')) + await lifecycle.cleanup() + + expect(session.metadataWrites[0]).toMatchObject({ + archiveReason: 'Session crashed' + }) + }) +}) diff --git a/cli/src/agent/runnerLifecycle.ts b/cli/src/agent/runnerLifecycle.ts index 0ae8faa9e2..1c25f757c8 100644 --- a/cli/src/agent/runnerLifecycle.ts +++ b/cli/src/agent/runnerLifecycle.ts @@ -23,7 +23,16 @@ export type RunnerLifecycle = { export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLifecycle { let exitCode = 0 - let archiveReason = 'User terminated' + // tiann/hapi#914: default reason is 'Hub restart' (parent-driven SIGTERM + // is the most common non-user cause). Genuine user actions (clicking + // Archive in the web UI, or Ctrl-C in a local terminal) explicitly + // reassign this via `setArchiveReason` BEFORE `cleanupAndExit` runs: + // - KillSession RPC handler → 'User terminated' (see registerKillSessionHandler) + // - SIGINT handler → 'User terminated' (Ctrl-C in local terminal) + // - uncaughtException/Reject → 'Session crashed' (via markCrash) + // Out-of-band SIGTERM (hub-restart cascade, `kill ` from host) keeps + // the default and is correctly labelled 'Hub restart' on the audit trail. + let archiveReason = 'Hub restart' let sessionEndReason: SessionEndReason = 'terminated' let cleanupStarted = false let cleanupPromise: Promise | null = null @@ -105,11 +114,19 @@ export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLi } const registerProcessHandlers = () => { + // tiann/hapi#914: SIGTERM is treated as the default reason ('Hub restart') + // because the runner is restarted by systemd as part of hub restart in + // production. If a future code path needs to distinguish "operator + // killed the host process" from "hub restart", it can call + // setArchiveReason() before the runner exits. process.on('SIGTERM', () => { void cleanupAndExit() }) + // Ctrl-C in a local terminal is genuine user intent — keep the + // pre-#914 label so the audit trail still shows it. process.on('SIGINT', () => { + archiveReason = 'User terminated' void cleanupAndExit() }) diff --git a/cli/src/api/apiSession.ts b/cli/src/api/apiSession.ts index d187eba2b6..6156115109 100644 --- a/cli/src/api/apiSession.ts +++ b/cli/src/api/apiSession.ts @@ -739,6 +739,21 @@ export class ApiSessionClient extends EventEmitter { }) } + /** + * tiann/hapi#913: wait until any pending `update-metadata` writes have + * been acked by the hub (or the timeout elapses). `updateMetadata` is + * fire-and-forget at the call site because it's invoked on the hot path + * for every turn; this helper lets the few callers who actually need + * durability — fresh ACP session-id pre-registration is the canonical + * case — synchronously gate on persistence without changing every + * caller's signature. + * + * Returns true when the lock drained, false when the timeout fired. + */ + async flushMetadata(timeoutMs: number = 5_000): Promise { + return await this.drainLock(this.metadataLock, timeoutMs) + } + async flush(options?: { timeoutMs?: number }): Promise { const deadlineMs = Date.now() + (options?.timeoutMs ?? 5_000) diff --git a/cli/src/claude/registerKillSessionHandler.test.ts b/cli/src/claude/registerKillSessionHandler.test.ts new file mode 100644 index 0000000000..b293172955 --- /dev/null +++ b/cli/src/claude/registerKillSessionHandler.test.ts @@ -0,0 +1,65 @@ +import { describe, expect, it, vi } from 'vitest' +import { RPC_METHODS } from '@hapi/protocol/rpcMethods' +import { registerKillSessionHandler } from './registerKillSessionHandler' + +// tiann/hapi#914: the KillSession RPC is the authoritative "user-terminated" +// signal because the hub only sends it when the operator clicks Archive in +// the web UI. Out-of-band SIGTERM (hub-restart cascade, host-level `kill`) +// hits the SIGTERM signal handler in runnerLifecycle, which now keeps the +// default reason 'Hub restart' so the audit trail stays correct. +describe('registerKillSessionHandler (tiann/hapi#914)', () => { + function makeRegistry() { + const handlers = new Map unknown>() + return { + registerHandler: (method: string, handler: (params: unknown) => unknown) => { + handlers.set(method, handler as (params?: unknown) => unknown) + }, + handlers + } + } + + it('stamps archiveReason=User terminated before triggering cleanupAndExit', async () => { + const registry = makeRegistry() + const lifecycle = { + setArchiveReason: vi.fn(), + cleanupAndExit: vi.fn(async () => {}) + } + + registerKillSessionHandler( + registry as unknown as Parameters[0], + lifecycle + ) + + const handler = registry.handlers.get(RPC_METHODS.KillSession) + expect(handler).toBeDefined() + + const result = await handler?.() + expect(result).toEqual({ success: true, message: 'Killing hapi CLI process' }) + + // setArchiveReason MUST be called BEFORE cleanupAndExit so the archive + // metadata write reads the correct reason. + const setReasonOrder = lifecycle.setArchiveReason.mock.invocationCallOrder[0] + const cleanupOrder = lifecycle.cleanupAndExit.mock.invocationCallOrder[0] + expect(setReasonOrder).toBeLessThan(cleanupOrder) + expect(lifecycle.setArchiveReason).toHaveBeenCalledWith('User terminated') + expect(lifecycle.cleanupAndExit).toHaveBeenCalled() + }) + + it('still works with the legacy `(cleanupAndExit: () => Promise)` call shape', async () => { + // Back-compat: runAgentSession.ts passes a bare closure as the second + // argument instead of a lifecycle object. The handler should not crash + // when setArchiveReason is absent. + const registry = makeRegistry() + const cleanupAndExit = vi.fn(async () => {}) + + registerKillSessionHandler( + registry as unknown as Parameters[0], + cleanupAndExit + ) + + const handler = registry.handlers.get(RPC_METHODS.KillSession) + await handler?.() + + expect(cleanupAndExit).toHaveBeenCalled() + }) +}) diff --git a/cli/src/claude/registerKillSessionHandler.ts b/cli/src/claude/registerKillSessionHandler.ts index 37936b79bc..b42b9b49fc 100644 --- a/cli/src/claude/registerKillSessionHandler.ts +++ b/cli/src/claude/registerKillSessionHandler.ts @@ -11,18 +11,41 @@ interface KillSessionResponse { message: string; } +/** + * tiann/hapi#914: callers can pass either a bare `cleanupAndExit` closure + * (legacy) or an options object that lets the kill-RPC stamp an explicit + * `archiveReason` before the lifecycle teardown runs. The hub only sends + * KillSession when the operator clicked Archive in the UI, so this RPC is + * the authoritative "user-terminated" signal; out-of-band SIGTERM from a + * hub-restart cascade no longer collides with the default archive reason. + */ +export interface KillSessionLifecycle { + cleanupAndExit: () => Promise; + setArchiveReason?: (reason: string) => void; +} export function registerKillSessionHandler( rpcHandlerManager: RpcHandlerManager, - killThisHappy: () => Promise + lifecycleOrCleanup: KillSessionLifecycle | (() => Promise) ) { + const lifecycle: KillSessionLifecycle = typeof lifecycleOrCleanup === 'function' + ? { cleanupAndExit: lifecycleOrCleanup } + : lifecycleOrCleanup; + rpcHandlerManager.registerHandler(RPC_METHODS.KillSession, async () => { logger.debug('Kill session request received'); + // tiann/hapi#914: stamp the archive reason from the RPC path so the + // default in `runnerLifecycle.ts` can be reassigned away from + // 'User terminated'. A hub-restart-cascade SIGTERM does NOT go + // through this handler — it hits the SIGTERM signal handler — so + // those archives now stay labelled `'Hub restart'` (the new default). + lifecycle.setArchiveReason?.('User terminated'); + // This will start the cleanup process - void killThisHappy(); + void lifecycle.cleanupAndExit(); - // We should still be able to respond the the client, though they + // We should still be able to respond to the client, though they // should optimistically assume the session is dead. return { success: true, diff --git a/cli/src/claude/runClaude.ts b/cli/src/claude/runClaude.ts index 4472aee92d..1ebb1601e5 100644 --- a/cli/src/claude/runClaude.ts +++ b/cli/src/claude/runClaude.ts @@ -145,7 +145,7 @@ export async function runClaude(options: StartOptions = {}): Promise { }); lifecycle.registerProcessHandlers(); - registerKillSessionHandler(session.rpcHandlerManager, lifecycle.cleanupAndExit); + registerKillSessionHandler(session.rpcHandlerManager, lifecycle); registerLocalHandoffHandler(session.rpcHandlerManager, lifecycle); // Set initial agent state diff --git a/cli/src/codex/runCodex.ts b/cli/src/codex/runCodex.ts index bda6f08c53..cc7d5e6006 100644 --- a/cli/src/codex/runCodex.ts +++ b/cli/src/codex/runCodex.ts @@ -89,7 +89,7 @@ export async function runCodex(opts: { }); lifecycle.registerProcessHandlers(); - registerKillSessionHandler(session.rpcHandlerManager, lifecycle.cleanupAndExit); + registerKillSessionHandler(session.rpcHandlerManager, lifecycle); registerLocalHandoffHandler(session.rpcHandlerManager, lifecycle); const applyCurrentConfigToSession = (options?: { syncModel?: boolean }) => { diff --git a/cli/src/cursor/cursorAcpRemoteLauncher.test.ts b/cli/src/cursor/cursorAcpRemoteLauncher.test.ts index 3aeb747389..1267ae9bda 100644 --- a/cli/src/cursor/cursorAcpRemoteLauncher.test.ts +++ b/cli/src/cursor/cursorAcpRemoteLauncher.test.ts @@ -143,6 +143,7 @@ function makeSession(sessionId: string | null): CursorSession { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive: vi.fn() @@ -267,11 +268,68 @@ describe('cursorAcpRemoteLauncher', () => { expect(session.onSessionFoundWithProtocol).toHaveBeenCalledWith('new-acp-session', 'acp'); }); + // tiann/hapi#913: fresh ACP sessions previously persisted `cursorSessionId` + // via fire-and-forget `updateMetadata`. A SIGTERM within ~1s of the first + // turn (hub-restart cascade) could strand the session because the ACK + // never arrived. The fix awaits `client.flushMetadata()` between + // `onSessionFoundWithProtocol` and the main loop, gating turn processing + // on a durable persist. + it('awaits flushMetadata after registering a fresh cursorSessionId so SIGTERM cannot strand the session', async () => { + const session = makeSession(null); + const flushSpy = vi.fn(async () => true); + // Replace the mock fixture's flushMetadata so we can observe ordering. + (session.client as unknown as { flushMetadata: typeof flushSpy }).flushMetadata = flushSpy; + + let flushCalled = false; + flushSpy.mockImplementation(async () => { + flushCalled = true; + return true; + }); + + const onSessionFoundSpy = session.onSessionFoundWithProtocol as ReturnType; + let onSessionFoundCalledBeforeFlush = false; + onSessionFoundSpy.mockImplementation(() => { + if (!flushCalled) { + onSessionFoundCalledBeforeFlush = true; + } + }); + + await cursorAcpRemoteLauncher(session); + + expect(onSessionFoundCalledBeforeFlush).toBe(true); + expect(flushSpy).toHaveBeenCalled(); + }); + + it('preserves the #834 resume-path pre-registration shape (registration before backend.loadSession)', async () => { + // PR #834 pre-registers `cursorSessionId` BEFORE `backend.loadSession` + // so a load-session failure on a legacy store does not strand the + // session. The #913 fix must not relocate or remove that + // pre-registration. We verify by observing call ordering on the spy. + const session = makeSession('resume-acp-session'); + const onSessionFoundSpy = session.onSessionFoundWithProtocol as ReturnType; + + let preRegisterCalledBeforeLoadSession = false; + let preRegisterArgs: unknown[] | null = null; + onSessionFoundSpy.mockImplementation((id: string, protocol: string) => { + if (!harness.loadSessionCalled) { + preRegisterCalledBeforeLoadSession = true; + preRegisterArgs = [id, protocol]; + } + }); + + await cursorAcpRemoteLauncher(session); + + expect(preRegisterCalledBeforeLoadSession).toBe(true); + expect(preRegisterArgs).toEqual(['resume-acp-session', 'acp']); + expect(harness.loadSessionCalled).toBe(true); + }); + it('applies debug mode immediately when setPermissionMode is called', async () => { const queue = new MessageQueue2((mode) => mode.permissionMode); const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive: vi.fn() @@ -316,6 +374,7 @@ describe('cursorAcpRemoteLauncher', () => { const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive @@ -360,6 +419,7 @@ describe('cursorAcpRemoteLauncher', () => { const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive @@ -407,6 +467,7 @@ describe('cursorAcpRemoteLauncher', () => { const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive: vi.fn() @@ -451,6 +512,7 @@ describe('cursorAcpRemoteLauncher', () => { const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive: vi.fn() @@ -507,6 +569,7 @@ describe('cursorAcpRemoteLauncher', () => { const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive @@ -548,6 +611,7 @@ describe('cursorAcpRemoteLauncher', () => { const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive: vi.fn() @@ -593,6 +657,7 @@ describe('cursorAcpRemoteLauncher', () => { const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive: vi.fn() @@ -633,6 +698,7 @@ describe('cursorAcpRemoteLauncher', () => { const client = { rpcHandlerManager: { registerHandler: vi.fn() }, updateMetadata: vi.fn(), + flushMetadata: vi.fn(async () => true), sendSessionEvent: vi.fn(), sendAgentMessage: vi.fn(), keepAlive: vi.fn(), diff --git a/cli/src/cursor/cursorAcpRemoteLauncher.ts b/cli/src/cursor/cursorAcpRemoteLauncher.ts index 58b611bee1..b98e37cf94 100644 --- a/cli/src/cursor/cursorAcpRemoteLauncher.ts +++ b/cli/src/cursor/cursorAcpRemoteLauncher.ts @@ -123,6 +123,18 @@ class CursorAcpRemoteLauncher extends RemoteLauncherBase { if (acpSessionId !== resumeSessionId) { session.onSessionFoundWithProtocol(acpSessionId, 'acp'); + // tiann/hapi#913: block until the metadata write that pins + // `cursorSessionId` reaches the hub DB before we drop into + // `runMainLoop`. If SIGTERM (hub-restart cascade) lands during + // the first turn without this gate, the only durable handle + // linking the session to its on-disk ACP store is lost and the + // session strands. The resume path at lines 98-100 already + // relies on the latency of `backend.loadSession()` to flush the + // same write; the fresh-session path has no such cover. + const flushed = await session.client.flushMetadata(); + if (!flushed) { + logger.warn(`[cursor-acp] cursorSessionId metadata write did not ACK within 5s; session may be unrecoverable if killed before the lock drains (acpSessionId=${acpSessionId})`); + } } syncCursorModelsFromAcp(backend, acpSessionId); diff --git a/cli/src/cursor/runCursor.ts b/cli/src/cursor/runCursor.ts index f5508f3429..a855a071fa 100644 --- a/cli/src/cursor/runCursor.ts +++ b/cli/src/cursor/runCursor.ts @@ -81,7 +81,7 @@ export async function runCursor(opts: { }); lifecycle.registerProcessHandlers(); - registerKillSessionHandler(session.rpcHandlerManager, lifecycle.cleanupAndExit); + registerKillSessionHandler(session.rpcHandlerManager, lifecycle); registerLocalHandoffHandler(session.rpcHandlerManager, lifecycle); const syncSessionMode = () => { diff --git a/cli/src/gemini/runGemini.ts b/cli/src/gemini/runGemini.ts index 34b13026e2..c2e667225e 100644 --- a/cli/src/gemini/runGemini.ts +++ b/cli/src/gemini/runGemini.ts @@ -113,7 +113,7 @@ export async function runGemini(opts: { }); lifecycle.registerProcessHandlers(); - registerKillSessionHandler(session.rpcHandlerManager, lifecycle.cleanupAndExit); + registerKillSessionHandler(session.rpcHandlerManager, lifecycle); registerLocalHandoffHandler(session.rpcHandlerManager, lifecycle); const syncSessionMode = () => { diff --git a/cli/src/kimi/runKimi.ts b/cli/src/kimi/runKimi.ts index 97cc3703bc..f148b880de 100644 --- a/cli/src/kimi/runKimi.ts +++ b/cli/src/kimi/runKimi.ts @@ -82,7 +82,7 @@ export async function runKimi(opts: { }); lifecycle.registerProcessHandlers(); - registerKillSessionHandler(session.rpcHandlerManager, lifecycle.cleanupAndExit); + registerKillSessionHandler(session.rpcHandlerManager, lifecycle); registerLocalHandoffHandler(session.rpcHandlerManager, lifecycle); const syncSessionMode = () => { diff --git a/cli/src/opencode/runOpencode.ts b/cli/src/opencode/runOpencode.ts index a78f931195..3e89e02a80 100644 --- a/cli/src/opencode/runOpencode.ts +++ b/cli/src/opencode/runOpencode.ts @@ -107,7 +107,7 @@ export async function runOpencode(opts: { }); lifecycle.registerProcessHandlers(); - registerKillSessionHandler(session.rpcHandlerManager, lifecycle.cleanupAndExit); + registerKillSessionHandler(session.rpcHandlerManager, lifecycle); registerLocalHandoffHandler(session.rpcHandlerManager, lifecycle); const syncSessionMode = () => { diff --git a/hub/src/sync/rpcGateway.test.ts b/hub/src/sync/rpcGateway.test.ts index 8c98fad941..d0825c0d68 100644 --- a/hub/src/sync/rpcGateway.test.ts +++ b/hub/src/sync/rpcGateway.test.ts @@ -1,7 +1,7 @@ import { describe, expect, it } from 'bun:test' import type { Server } from 'socket.io' import type { RpcRegistry } from '../socket/rpcRegistry' -import { RpcGateway } from './rpcGateway' +import { RpcGateway, RpcTargetMissingError } from './rpcGateway' function createGateway() { const timeouts: number[] = [] @@ -70,3 +70,48 @@ describe('RpcGateway RPC timeouts', () => { }) }) +// tiann/hapi#916: rpcCall throws a typed `RpcTargetMissingError` when the +// target CLI is unreachable, so syncEngine.archiveSession can narrow on it +// and treat the kill as a benign no-op. +describe('RpcGateway no-target diagnostics (tiann/hapi#916)', () => { + it('throws RpcTargetMissingError(handler-not-registered) when no socket is registered for the method', async () => { + const io = { + of() { + return { + sockets: { + get() { return undefined } + } + } + } + } as unknown as Server + const rpcRegistry = { + getSocketIdForMethod() { return undefined } + } as unknown as RpcRegistry + const gateway = new RpcGateway(io, rpcRegistry) + + const error = await gateway.killSession('session-1').catch((e: unknown) => e) + expect(error).toBeInstanceOf(RpcTargetMissingError) + expect((error as RpcTargetMissingError).code).toBe('handler-not-registered') + }) + + it('throws RpcTargetMissingError(socket-disconnected) when the socket id is registered but no socket exists', async () => { + const io = { + of() { + return { + sockets: { + get() { return undefined } + } + } + } + } as unknown as Server + const rpcRegistry = { + getSocketIdForMethod() { return 'socket-1' } + } as unknown as RpcRegistry + const gateway = new RpcGateway(io, rpcRegistry) + + const error = await gateway.killSession('session-1').catch((e: unknown) => e) + expect(error).toBeInstanceOf(RpcTargetMissingError) + expect((error as RpcTargetMissingError).code).toBe('socket-disconnected') + }) +}) + diff --git a/hub/src/sync/rpcGateway.ts b/hub/src/sync/rpcGateway.ts index 8e03894014..1575d40365 100644 --- a/hub/src/sync/rpcGateway.ts +++ b/hub/src/sync/rpcGateway.ts @@ -24,6 +24,27 @@ import type { RpcRegistry } from '../socket/rpcRegistry' const DEFAULT_RPC_TIMEOUT_MS = 30_000 const MODEL_LIST_RPC_TIMEOUT_MS = 120_000 +/** + * tiann/hapi#916: thrown by {@link RpcGateway.rpcCall} when the target CLI is + * unreachable (handler not registered or socket disconnected). Callers can + * narrow on this to treat "CLI gone" as a benign condition (e.g. archive + * still succeeds at the hub level) without swallowing real RPC errors like + * timeouts or protocol failures. + */ +export class RpcTargetMissingError extends Error { + readonly code: 'handler-not-registered' | 'socket-disconnected' + readonly method: string + + constructor(method: string, reason: 'handler-not-registered' | 'socket-disconnected') { + super(reason === 'handler-not-registered' + ? `RPC handler not registered: ${method}` + : `RPC socket disconnected: ${method}`) + this.name = 'RpcTargetMissingError' + this.code = reason + this.method = method + } +} + export type RpcCommandResponse = CommandResponse export type RpcReadFileResponse = FileReadResponse export type RpcGeneratedImageResponse = GeneratedImageResponse @@ -285,12 +306,12 @@ export class RpcGateway { private async rpcCall(method: string, params: unknown, timeoutMs: number = DEFAULT_RPC_TIMEOUT_MS): Promise { const socketId = this.rpcRegistry.getSocketIdForMethod(method) if (!socketId) { - throw new Error(`RPC handler not registered: ${method}`) + throw new RpcTargetMissingError(method, 'handler-not-registered') } const socket = this.io.of('/cli').sockets.get(socketId) if (!socket) { - throw new Error(`RPC socket disconnected: ${method}`) + throw new RpcTargetMissingError(method, 'socket-disconnected') } const response = await socket.timeout(timeoutMs).emitWithAck('rpc-request', { diff --git a/hub/src/sync/sessionCache.ts b/hub/src/sync/sessionCache.ts index ecfabd7e3f..ae91ca92ce 100644 --- a/hub/src/sync/sessionCache.ts +++ b/hub/src/sync/sessionCache.ts @@ -7,6 +7,11 @@ import { extractTodoWriteTodosFromMessageContent, TodosSchema } from './todos' import { extractBackgroundTaskDelta } from './backgroundTasks' const QUEUED_MESSAGE_THINKING_GRACE_MS = 15_000 +// tiann/hapi#919: metadata writers (renameSession, clearSessionArchiveMetadata, +// restoreSessionArchiveMetadata) retry on version-mismatch with a fresh cache +// snapshot. Cap retries so genuine concurrent contention still surfaces to the +// HTTP caller as 409 instead of spinning forever. +const METADATA_RETRY_ATTEMPTS = 5 type RuntimeConfigKey = 'permissionMode' | 'model' | 'modelReasoningEffort' | 'effort' | 'collaborationMode' export class SessionCache { @@ -484,32 +489,95 @@ export class SessionCache { return updatedAt !== undefined && payloadTime < updatedAt } - async renameSession(sessionId: string, name: string): Promise { - const session = this.sessions.get(sessionId) - if (!session) { - throw new Error('Session not found') - } + /** + * tiann/hapi#916: hub-side write of the archive-metadata fields normally + * authored by the CLI's `archiveAndClose`. Called by `syncEngine.archiveSession` + * when the kill-RPC fails because the CLI is unreachable (e.g. the + * hub-restart cascade already killed it). Without this, the route would + * either 500 (pre-fix) or silently return ok=true while leaving + * `lifecycleState=running` on disk — both confuse the operator. + * + * Idempotent: if `lifecycleState` is already `archived` we return without + * touching the row to avoid resetting `lifecycleStateSince`. Best-effort: + * if every retry hits `version-mismatch` (genuine contention) the original + * `archiveSession` flow still marks the session inactive in cache via + * `handleSessionEnd`, just without flipping the persisted lifecycle. + */ + markSessionArchivedFromHub(sessionId: string, reason: string): void { + for (let attempt = 0; attempt < METADATA_RETRY_ATTEMPTS; attempt += 1) { + const session = this.sessions.get(sessionId) ?? this.refreshSession(sessionId) + if (!session) return + const current = session.metadata + if (!current) return + if (current.lifecycleState === 'archived') { + return + } - const currentMetadata = session.metadata ?? { path: '', host: '' } - const newMetadata = { ...currentMetadata, name } + const next: Record = { + ...current, + lifecycleState: 'archived', + lifecycleStateSince: Date.now(), + archivedBy: 'hub', + archiveReason: reason + } - const result = this.store.sessions.updateSessionMetadata( - sessionId, - newMetadata, - session.metadataVersion, - session.namespace, - { touchUpdatedAt: false } - ) + const result = this.store.sessions.updateSessionMetadata( + sessionId, + next, + session.metadataVersion, + session.namespace, + { touchUpdatedAt: false } + ) - if (result.result === 'error') { - throw new Error('Failed to update session metadata') + if (result.result === 'error') { + return + } + + if (result.result === 'success') { + this.refreshSession(sessionId) + return + } + + this.refreshSession(sessionId) } + } + + async renameSession(sessionId: string, name: string): Promise { + // tiann/hapi#919: retry-with-refresh on version-mismatch instead of + // throwing on the first contention. Mirrors the good pattern in + // mergeSessions (~L780) and in syncEngine's metadata helpers. Without + // this, a stale cache snapshot produces forever-409 on PATCH /sessions/:id + // until some unrelated event triggers a refresh. + for (let attempt = 0; attempt < METADATA_RETRY_ATTEMPTS; attempt += 1) { + const session = this.sessions.get(sessionId) ?? this.refreshSession(sessionId) + if (!session) { + throw new Error('Session not found') + } + + const currentMetadata = session.metadata ?? { path: '', host: '' } + const newMetadata = { ...currentMetadata, name } + + const result = this.store.sessions.updateSessionMetadata( + sessionId, + newMetadata, + session.metadataVersion, + session.namespace, + { touchUpdatedAt: false } + ) + + if (result.result === 'error') { + throw new Error('Failed to update session metadata') + } + + if (result.result === 'success') { + this.refreshSession(sessionId) + return + } - if (result.result === 'version-mismatch') { - throw new Error('Session was modified concurrently. Please try again.') + this.refreshSession(sessionId) } - this.refreshSession(sessionId) + throw new Error('Session was modified concurrently. Please try again.') } /** @@ -525,52 +593,59 @@ export class SessionCache { * No-op when metadata is null (callers should pre-check). */ async clearSessionArchiveMetadata(sessionId: string): Promise<{ cursorSessionProtocol?: 'acp' | 'stream-json' }> { - const session = this.sessions.get(sessionId) - if (!session) { - throw new Error('Session not found') - } - - const currentMetadata = session.metadata - if (!currentMetadata) { - throw new Error('Session metadata missing') - } + // tiann/hapi#919: retry-with-refresh on version-mismatch. The reopen + // flow runs this on every archived-session resume — a stale snapshot + // here used to forever-409 the only reopen affordance. + for (let attempt = 0; attempt < METADATA_RETRY_ATTEMPTS; attempt += 1) { + const session = this.sessions.get(sessionId) ?? this.refreshSession(sessionId) + if (!session) { + throw new Error('Session not found') + } - const next: Record = { ...currentMetadata } - delete next.lifecycleState - delete next.archivedBy - delete next.archiveReason - next.lifecycleStateSince = Date.now() + const currentMetadata = session.metadata + if (!currentMetadata) { + throw new Error('Session metadata missing') + } - let cursorSessionProtocol: 'acp' | 'stream-json' | undefined - if (currentMetadata.flavor === 'cursor') { - const existing = currentMetadata.cursorSessionProtocol - if (existing === 'acp' || existing === 'stream-json') { - cursorSessionProtocol = existing - } else if (currentMetadata.cursorSessionId) { - // Pre-#799 default: presence of cursorSessionId without protocol means stream-json. - cursorSessionProtocol = 'stream-json' - next.cursorSessionProtocol = 'stream-json' + const next: Record = { ...currentMetadata } + delete next.lifecycleState + delete next.archivedBy + delete next.archiveReason + next.lifecycleStateSince = Date.now() + + let cursorSessionProtocol: 'acp' | 'stream-json' | undefined + if (currentMetadata.flavor === 'cursor') { + const existing = currentMetadata.cursorSessionProtocol + if (existing === 'acp' || existing === 'stream-json') { + cursorSessionProtocol = existing + } else if (currentMetadata.cursorSessionId) { + // Pre-#799 default: presence of cursorSessionId without protocol means stream-json. + cursorSessionProtocol = 'stream-json' + next.cursorSessionProtocol = 'stream-json' + } } - } - const result = this.store.sessions.updateSessionMetadata( - sessionId, - next, - session.metadataVersion, - session.namespace, - { touchUpdatedAt: false } - ) + const result = this.store.sessions.updateSessionMetadata( + sessionId, + next, + session.metadataVersion, + session.namespace, + { touchUpdatedAt: false } + ) - if (result.result === 'error') { - throw new Error('Failed to update session metadata') - } + if (result.result === 'error') { + throw new Error('Failed to update session metadata') + } + + if (result.result === 'success') { + this.refreshSession(sessionId) + return cursorSessionProtocol ? { cursorSessionProtocol } : {} + } - if (result.result === 'version-mismatch') { - throw new Error('Session was modified concurrently. Please try again.') + this.refreshSession(sessionId) } - this.refreshSession(sessionId) - return cursorSessionProtocol ? { cursorSessionProtocol } : {} + throw new Error('Session was modified concurrently. Please try again.') } /** @@ -594,50 +669,59 @@ export class SessionCache { lifecycleStateSince?: number } ): Promise { - const session = this.sessions.get(sessionId) - if (!session) return - const current = session.metadata - if (!current) return + // tiann/hapi#919: retry-with-refresh on version-mismatch. This is the + // /reopen rollback path — if it fails the session is left in a + // half-cleared archive state, so making it robust to a stale snapshot + // matters more here than for the other two. + for (let attempt = 0; attempt < METADATA_RETRY_ATTEMPTS; attempt += 1) { + const session = this.sessions.get(sessionId) ?? this.refreshSession(sessionId) + if (!session) return + const current = session.metadata + if (!current) return + + const next: Record = { ...current } + if (snapshot.lifecycleState !== undefined) { + next.lifecycleState = snapshot.lifecycleState + } else { + delete next.lifecycleState + } + if (snapshot.archivedBy !== undefined) { + next.archivedBy = snapshot.archivedBy + } else { + delete next.archivedBy + } + if (snapshot.archiveReason !== undefined) { + next.archiveReason = snapshot.archiveReason + } else { + delete next.archiveReason + } + if (snapshot.lifecycleStateSince !== undefined) { + next.lifecycleStateSince = snapshot.lifecycleStateSince + } else { + delete next.lifecycleStateSince + } - const next: Record = { ...current } - if (snapshot.lifecycleState !== undefined) { - next.lifecycleState = snapshot.lifecycleState - } else { - delete next.lifecycleState - } - if (snapshot.archivedBy !== undefined) { - next.archivedBy = snapshot.archivedBy - } else { - delete next.archivedBy - } - if (snapshot.archiveReason !== undefined) { - next.archiveReason = snapshot.archiveReason - } else { - delete next.archiveReason - } - if (snapshot.lifecycleStateSince !== undefined) { - next.lifecycleStateSince = snapshot.lifecycleStateSince - } else { - delete next.lifecycleStateSince - } + const result = this.store.sessions.updateSessionMetadata( + sessionId, + next, + session.metadataVersion, + session.namespace, + { touchUpdatedAt: false } + ) - const result = this.store.sessions.updateSessionMetadata( - sessionId, - next, - session.metadataVersion, - session.namespace, - { touchUpdatedAt: false } - ) + if (result.result === 'error') { + throw new Error('Failed to restore archive metadata') + } - if (result.result === 'error') { - throw new Error('Failed to restore archive metadata') - } + if (result.result === 'success') { + this.refreshSession(sessionId) + return + } - if (result.result === 'version-mismatch') { - throw new Error('Session was modified concurrently during reopen rollback') + this.refreshSession(sessionId) } - this.refreshSession(sessionId) + throw new Error('Session was modified concurrently during reopen rollback') } async deleteSession(sessionId: string): Promise { diff --git a/hub/src/sync/sessionModel.test.ts b/hub/src/sync/sessionModel.test.ts index 4ec57f5a17..f7584915b7 100644 --- a/hub/src/sync/sessionModel.test.ts +++ b/hub/src/sync/sessionModel.test.ts @@ -2115,4 +2115,213 @@ describe('session model', () => { })).resolves.toBeUndefined() }) }) + + // tiann/hapi#916: when the CLI is gone, the kill-RPC throws + // RpcTargetMissingError. markSessionArchivedFromHub writes the archive + // metadata directly so the row's lifecycleState still flips to 'archived'. + describe('markSessionArchivedFromHub (tiann/hapi#916)', () => { + it('flips lifecycleState to archived with archivedBy=hub and the supplied reason', () => { + const store = new Store(':memory:') + const events: SyncEvent[] = [] + const cache = new SessionCache(store, createPublisher(events)) + + const session = cache.getOrCreateSession( + 'session-hub-archive', + { path: '/tmp/project', host: 'localhost', flavor: 'codex', codexSessionId: 'thread-1' }, + null, + 'default' + ) + + cache.markSessionArchivedFromHub(session.id, 'Archived from hub (CLI unreachable)') + + const meta = cache.getSession(session.id)?.metadata as Record | null | undefined + expect(meta?.lifecycleState).toBe('archived') + expect(meta?.archivedBy).toBe('hub') + expect(meta?.archiveReason).toBe('Archived from hub (CLI unreachable)') + expect(typeof meta?.lifecycleStateSince).toBe('number') + }) + + it('is idempotent for already-archived sessions (does not reset lifecycleStateSince)', async () => { + const store = new Store(':memory:') + const events: SyncEvent[] = [] + const cache = new SessionCache(store, createPublisher(events)) + + const initialSince = 1700000000000 + const session = cache.getOrCreateSession( + 'session-already-archived', + { + path: '/tmp/project', + host: 'localhost', + flavor: 'codex', + lifecycleState: 'archived', + archivedBy: 'cli', + archiveReason: 'User terminated', + lifecycleStateSince: initialSince + }, + null, + 'default' + ) + + cache.markSessionArchivedFromHub(session.id, 'Should not overwrite') + + const meta = cache.getSession(session.id)?.metadata as Record | null | undefined + expect(meta?.lifecycleState).toBe('archived') + expect(meta?.archivedBy).toBe('cli') + expect(meta?.archiveReason).toBe('User terminated') + expect(meta?.lifecycleStateSince).toBe(initialSince) + }) + + it('self-heals on version-mismatch via refresh-and-retry', () => { + const store = new Store(':memory:') + const events: SyncEvent[] = [] + const cache = new SessionCache(store, createPublisher(events)) + + const session = cache.getOrCreateSession( + 'session-hub-archive-stale', + { path: '/tmp/project', host: 'localhost', flavor: 'codex' }, + null, + 'default' + ) + + const dbSession = store.sessions.getSessionByNamespace(session.id, 'default')! + const oobWrite = store.sessions.updateSessionMetadata( + session.id, + { ...dbSession.metadata!, name: 'oob' }, + dbSession.metadataVersion, + 'default', + { touchUpdatedAt: false } + ) + expect(oobWrite.result).toBe('success') + + cache.markSessionArchivedFromHub(session.id, 'CLI unreachable') + + const meta = cache.getSession(session.id)?.metadata as Record | null | undefined + expect(meta?.lifecycleState).toBe('archived') + expect(meta?.archivedBy).toBe('hub') + expect(meta?.name).toBe('oob') + }) + }) + + // tiann/hapi#919: the three metadata writers must self-heal on + // version-mismatch instead of one-shot-throwing. The bug was that a + // stale cache snapshot produced forever-409 on the corresponding HTTP + // endpoints — the cache never refreshed, so the same retry hit the + // same mismatch. Pattern mirrors mergeSessions (line ~780). + describe('version-mismatch self-heal (tiann/hapi#919)', () => { + it('renameSession recovers after a stale cache snapshot is detected', async () => { + const store = new Store(':memory:') + const events: SyncEvent[] = [] + const cache = new SessionCache(store, createPublisher(events)) + + const session = cache.getOrCreateSession( + 'session-rename-stale', + { path: '/tmp/project', host: 'localhost', flavor: 'codex' }, + null, + 'default' + ) + + // Simulate a concurrent writer bumping the DB version under our feet: + // write a metadata patch out-of-band via the store, leaving the cache + // snapshot stale. + const dbSession = store.sessions.getSessionByNamespace(session.id, 'default')! + const oobWrite = store.sessions.updateSessionMetadata( + session.id, + { ...dbSession.metadata!, name: 'concurrent-rename' }, + dbSession.metadataVersion, + 'default', + { touchUpdatedAt: false } + ) + expect(oobWrite.result).toBe('success') + + // Cache still holds the pre-OOB snapshot. Pre-fix, this call threw + // 'Session was modified concurrently'. Post-fix, it refreshes and + // succeeds. + await expect(cache.renameSession(session.id, 'final-name')).resolves.toBeUndefined() + + const meta = cache.getSession(session.id)?.metadata as Record | null | undefined + expect(meta?.name).toBe('final-name') + }) + + it('clearSessionArchiveMetadata recovers after a stale cache snapshot', async () => { + const store = new Store(':memory:') + const events: SyncEvent[] = [] + const cache = new SessionCache(store, createPublisher(events)) + + const session = cache.getOrCreateSession( + 'session-clear-stale', + { + path: '/tmp/project', + host: 'localhost', + flavor: 'codex', + codexSessionId: 'thread-stale', + lifecycleState: 'archived', + archivedBy: 'cli', + archiveReason: 'User terminated' + }, + null, + 'default' + ) + + // Concurrent rename via the store bumps the DB version. + const dbSession = store.sessions.getSessionByNamespace(session.id, 'default')! + const oobWrite = store.sessions.updateSessionMetadata( + session.id, + { ...dbSession.metadata!, name: 'oob-name' }, + dbSession.metadataVersion, + 'default', + { touchUpdatedAt: false } + ) + expect(oobWrite.result).toBe('success') + + await expect(cache.clearSessionArchiveMetadata(session.id)).resolves.toBeDefined() + + const meta = cache.getSession(session.id)?.metadata as Record | null | undefined + expect(meta?.lifecycleState).toBeUndefined() + expect(meta?.archivedBy).toBeUndefined() + expect(meta?.name).toBe('oob-name') + }) + + it('restoreSessionArchiveMetadata recovers after a stale cache snapshot', async () => { + const store = new Store(':memory:') + const events: SyncEvent[] = [] + const cache = new SessionCache(store, createPublisher(events)) + + const session = cache.getOrCreateSession( + 'session-restore-stale', + { + path: '/tmp/project', + host: 'localhost', + flavor: 'codex', + codexSessionId: 'thread-restore-stale' + // Started without archive metadata - simulates the post-clear state. + }, + null, + 'default' + ) + + // Concurrent unrelated write bumps DB version. + const dbSession = store.sessions.getSessionByNamespace(session.id, 'default')! + const oobWrite = store.sessions.updateSessionMetadata( + session.id, + { ...dbSession.metadata!, name: 'parallel-rename' }, + dbSession.metadataVersion, + 'default', + { touchUpdatedAt: false } + ) + expect(oobWrite.result).toBe('success') + + await expect(cache.restoreSessionArchiveMetadata(session.id, { + lifecycleState: 'archived', + archivedBy: 'cli', + archiveReason: 'User terminated', + lifecycleStateSince: 1234 + })).resolves.toBeUndefined() + + const meta = cache.getSession(session.id)?.metadata as Record | null | undefined + expect(meta?.lifecycleState).toBe('archived') + expect(meta?.archiveReason).toBe('User terminated') + expect(meta?.lifecycleStateSince).toBe(1234) + expect(meta?.name).toBe('parallel-rename') + }) + }) }) diff --git a/hub/src/sync/syncEngine.ts b/hub/src/sync/syncEngine.ts index a7f88c032a..cc5a8aea73 100644 --- a/hub/src/sync/syncEngine.ts +++ b/hub/src/sync/syncEngine.ts @@ -23,6 +23,7 @@ import { MachineCache, type Machine } from './machineCache' import { MessageService } from './messageService' import { RpcGateway, + RpcTargetMissingError, type RpcCodexModel, type RpcCommandResponse, type RpcDeleteUploadResponse, @@ -428,7 +429,24 @@ export class SyncEngine { } async archiveSession(sessionId: string): Promise { - await this.rpcGateway.killSession(sessionId) + // tiann/hapi#916: when the CLI is already gone (e.g. after a + // hub-restart cascade SIGTERMed the runner but the in-memory + // `active` flag has not been reconciled yet) the kill-RPC throws + // and the route used to surface that as HTTP 500. Treat the + // missing target as a benign condition: still flip the session's + // lifecycleState to `archived` in the hub-side metadata so the + // UI does not see a half-cleaned zombie, and continue to mark + // it inactive in the cache. Real RPC errors (timeout, protocol + // failure) still propagate as 5xx. + try { + await this.rpcGateway.killSession(sessionId) + } catch (error) { + if (error instanceof RpcTargetMissingError) { + this.sessionCache.markSessionArchivedFromHub(sessionId, 'Archived from hub (CLI unreachable)') + } else { + throw error + } + } this.handleSessionEnd({ sid: sessionId, time: Date.now() }) } diff --git a/hub/src/web/routes/sessions.test.ts b/hub/src/web/routes/sessions.test.ts index 6ec0be8348..5822a40502 100644 --- a/hub/src/web/routes/sessions.test.ts +++ b/hub/src/web/routes/sessions.test.ts @@ -61,6 +61,7 @@ function createApp(session: Session, opts?: { listSlashCommands?: SyncEngine['listSlashCommands'] getSessionExport?: (sessionId: string, session: Session) => unknown sessionExists?: boolean + archiveSession?: (sessionId: string) => Promise }) { const applySessionConfigCalls: Array<[string, Record]> = [] const applySessionConfig = async (sessionId: string, config: Record) => { @@ -103,6 +104,7 @@ function createApp(session: Session, opts?: { resumed: true })) const sessionExists = opts?.sessionExists !== false + const archiveSessionMock = opts?.archiveSession ?? (async () => {}) const engine = { resolveSessionAccess: () => sessionExists ? { ok: true, sessionId: session.id, session } @@ -114,6 +116,7 @@ function createApp(session: Session, opts?: { listOpencodeReasoningEffortOptionsForSession, resumeSession, reopenSession, + archiveSession: archiveSessionMock, getSessionExport: opts?.getSessionExport ?? (() => ({ type: 'success', payload: { @@ -948,4 +951,88 @@ describe('sessions routes', () => { }) }) + // tiann/hapi#916: archive endpoint must be idempotent for already-archived + // rows and for split-brain rows whose CLI is gone but the in-memory `active` + // flag has not been reconciled to false yet. + describe('POST /sessions/:id/archive (tiann/hapi#916)', () => { + it('returns 2xx and calls archiveSession for an active session', async () => { + const calls: string[] = [] + const session = createSession({ active: true }) + const { app } = createApp(session, { + archiveSession: async (sessionId: string) => { calls.push(sessionId) } + }) + + const response = await app.request('/api/sessions/session-1/archive', { method: 'POST' }) + + expect(response.status).toBe(200) + expect(await response.json()).toEqual({ ok: true }) + expect(calls).toEqual(['session-1']) + }) + + it('returns 2xx and skips archiveSession when the row is already archived (idempotent)', async () => { + let called = false + const session = createSession({ + active: false, + metadata: { + path: '/tmp/project', + host: 'localhost', + flavor: 'codex', + lifecycleState: 'archived', + archivedBy: 'cli', + archiveReason: 'User terminated' + } + }) + const { app } = createApp(session, { + archiveSession: async () => { called = true } + }) + + const response = await app.request('/api/sessions/session-1/archive', { method: 'POST' }) + + expect(response.status).toBe(200) + expect(await response.json()).toEqual({ ok: true, alreadyArchived: true }) + expect(called).toBe(false) + }) + + it('returns 2xx when the active session\'s CLI is gone — engine.archiveSession swallows the missing-RPC error', async () => { + // Pre-fix this returned 500 because rpcGateway.killSession threw + // 'RPC handler not registered'. Post-fix the engine narrows on + // RpcTargetMissingError and still flips lifecycle to archived. + const session = createSession({ active: true }) + const { app } = createApp(session, { + archiveSession: async () => { + // Simulates the post-fix behavior: engine catches the + // RpcTargetMissingError, calls markSessionArchivedFromHub, + // and returns normally. + } + }) + + const response = await app.request('/api/sessions/session-1/archive', { method: 'POST' }) + + expect(response.status).toBe(200) + expect(await response.json()).toEqual({ ok: true }) + }) + + it('still surfaces a 5xx for non-RPC errors (e.g. DB write failure)', async () => { + const session = createSession({ active: true }) + const { app } = createApp(session, { + archiveSession: async () => { + throw new Error('DB write failed') + } + }) + + const response = await app.request('/api/sessions/session-1/archive', { method: 'POST' }) + expect(response.status).toBe(500) + }) + + it('returns 404 when the session id is unknown', async () => { + const session = createSession() + const { app } = createApp(session, { sessionExists: false }) + + const response = await app.request('/api/sessions/missing-id/archive', { method: 'POST' }) + + expect(response.status).toBe(404) + expect(await response.json()).toEqual({ error: 'Session not found' }) + }) + }) + }) diff --git a/hub/src/web/routes/sessions.ts b/hub/src/web/routes/sessions.ts index 3725a3bd10..315f20f4b2 100644 --- a/hub/src/web/routes/sessions.ts +++ b/hub/src/web/routes/sessions.ts @@ -290,16 +290,25 @@ export function createSessionsRoutes(getSyncEngine: () => SyncEngine | null): Ho }) app.post('/sessions/:id/archive', async (c) => { + // tiann/hapi#916: drop `requireActive: true` so the endpoint is + // idempotent for already-archived rows AND for live-in-cache / + // dead-on-cli split-brain rows that the operator wants to clean up + // after a hub-restart cascade. We still 404 on unknown ids via + // requireSessionFromParam, and 5xx on real RPC failures. const engine = requireSyncEngine(c, getSyncEngine) if (engine instanceof Response) { return engine } - const sessionResult = requireSessionFromParam(c, engine, { requireActive: true }) + const sessionResult = requireSessionFromParam(c, engine) if (sessionResult instanceof Response) { return sessionResult } + if (sessionResult.session.metadata?.lifecycleState === 'archived') { + return c.json({ ok: true, alreadyArchived: true }) + } + await engine.archiveSession(sessionResult.sessionId) return c.json({ ok: true }) }) From 1c8972a3e85668619a122c5bc137ad97989d1868 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Mon, 15 Jun 2026 22:00:12 +0100 Subject: [PATCH 2/5] fix(cli): runner-spawned children use 'Stopped by runner' as default archive reason Addresses bot review of #923: with the #914 default-archiveReason flip to 'Hub restart', runner-driven SIGTERM paths (`hapi runner stop-session`, webhook-timeout cleanup at run.ts:587, orphan-cleanup at run.ts:267) all mislabel as 'Hub restart' which is also inaccurate audit-trail noise. Smallest defensible change: parameterise the lifecycle default via HAPI_DEFAULT_ARCHIVE_REASON env, and have the runner set 'Stopped by runner' on spawn. Terminal-launched sessions (no runner parent, no env var) still default to 'Hub restart' since hub-restart cascade documented at #915 is the most plausible SIGTERM source for those. Explicit overrides via setArchiveReason (KillSession RPC, SIGINT Ctrl-C, markCrash uncaught exception) still win. Two new unit tests cover the env-var default and the override precedence. Refs tiann/hapi#914. Co-authored-by: Cursor --- cli/src/agent/runnerLifecycle.test.ts | 54 +++++++++++++++++++++++++++ cli/src/agent/runnerLifecycle.ts | 22 +++++++---- cli/src/runner/run.ts | 11 +++++- 3 files changed, 79 insertions(+), 8 deletions(-) diff --git a/cli/src/agent/runnerLifecycle.test.ts b/cli/src/agent/runnerLifecycle.test.ts index 7b3ac02b6f..b3fc26fa3c 100644 --- a/cli/src/agent/runnerLifecycle.test.ts +++ b/cli/src/agent/runnerLifecycle.test.ts @@ -70,4 +70,58 @@ describe('createRunnerLifecycle archiveReason defaults (tiann/hapi#914)', () => archiveReason: 'Session crashed' }) }) + + // tiann/hapi#914 review feedback: runner-spawned children should not + // get the 'Hub restart' label when the runner SIGTERMs them via + // stop-session, webhook-timeout cleanup, or orphan cleanup. The runner + // sets HAPI_DEFAULT_ARCHIVE_REASON on the child's spawn env so the + // default reason is path-accurate without changing every kill site. + it('honours HAPI_DEFAULT_ARCHIVE_REASON env as the default reason', async () => { + const session = makeFakeSession() + const original = process.env.HAPI_DEFAULT_ARCHIVE_REASON + process.env.HAPI_DEFAULT_ARCHIVE_REASON = 'Stopped by runner' + try { + const lifecycle = createRunnerLifecycle({ + session: session as unknown as Parameters[0]['session'], + logTag: 'test' + }) + + await lifecycle.cleanup() + + expect(session.metadataWrites[0]).toMatchObject({ + archiveReason: 'Stopped by runner' + }) + } finally { + if (original === undefined) { + delete process.env.HAPI_DEFAULT_ARCHIVE_REASON + } else { + process.env.HAPI_DEFAULT_ARCHIVE_REASON = original + } + } + }) + + it('setArchiveReason still wins over HAPI_DEFAULT_ARCHIVE_REASON', async () => { + const session = makeFakeSession() + const original = process.env.HAPI_DEFAULT_ARCHIVE_REASON + process.env.HAPI_DEFAULT_ARCHIVE_REASON = 'Stopped by runner' + try { + const lifecycle = createRunnerLifecycle({ + session: session as unknown as Parameters[0]['session'], + logTag: 'test' + }) + + lifecycle.setArchiveReason('User terminated') + await lifecycle.cleanup() + + expect(session.metadataWrites[0]).toMatchObject({ + archiveReason: 'User terminated' + }) + } finally { + if (original === undefined) { + delete process.env.HAPI_DEFAULT_ARCHIVE_REASON + } else { + process.env.HAPI_DEFAULT_ARCHIVE_REASON = original + } + } + }) }) diff --git a/cli/src/agent/runnerLifecycle.ts b/cli/src/agent/runnerLifecycle.ts index 1c25f757c8..796544c56d 100644 --- a/cli/src/agent/runnerLifecycle.ts +++ b/cli/src/agent/runnerLifecycle.ts @@ -23,16 +23,24 @@ export type RunnerLifecycle = { export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLifecycle { let exitCode = 0 - // tiann/hapi#914: default reason is 'Hub restart' (parent-driven SIGTERM - // is the most common non-user cause). Genuine user actions (clicking - // Archive in the web UI, or Ctrl-C in a local terminal) explicitly - // reassign this via `setArchiveReason` BEFORE `cleanupAndExit` runs: + // tiann/hapi#914: default reason is parent-driven SIGTERM (the most + // common non-user cause). Genuine user actions (clicking Archive in the + // web UI, or Ctrl-C in a local terminal) explicitly reassign this via + // `setArchiveReason` BEFORE `cleanupAndExit` runs: // - KillSession RPC handler → 'User terminated' (see registerKillSessionHandler) // - SIGINT handler → 'User terminated' (Ctrl-C in local terminal) // - uncaughtException/Reject → 'Session crashed' (via markCrash) - // Out-of-band SIGTERM (hub-restart cascade, `kill ` from host) keeps - // the default and is correctly labelled 'Hub restart' on the audit trail. - let archiveReason = 'Hub restart' + // + // The default is parameterised via `HAPI_DEFAULT_ARCHIVE_REASON` so that + // the runner (cli/src/runner/run.ts) can label children it spawns with + // 'Stopped by runner' — runner-driven SIGTERMs (stop-session command, + // webhook-timeout cleanup, orphan cleanup) are NOT hub restarts, and + // hub-restart cascades for runner-spawned children also flow through + // the runner in production. Terminal-launched sessions (no runner + // parent) fall back to 'Hub restart' since the most plausible + // out-of-band SIGTERM source for those is the systemd hub-restart + // cascade documented at tiann/hapi#915. + let archiveReason = process.env.HAPI_DEFAULT_ARCHIVE_REASON || 'Hub restart' let sessionEndReason: SessionEndReason = 'terminated' let cleanupStarted = false let cleanupPromise: Promise | null = null diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts index f49d3b15b1..a252522fe8 100644 --- a/cli/src/runner/run.ts +++ b/cli/src/runner/run.ts @@ -456,7 +456,16 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): stdio: ['ignore', 'pipe', 'pipe'], // Capture stdout/stderr for debugging env: { ...process.env, - ...extraEnv + ...extraEnv, + // tiann/hapi#914 review: runner-spawned children get SIGTERMed + // by three runner paths (stopSession, webhook-timeout cleanup, + // orphan cleanup). All three are runner actions, not hub + // restarts and not user terminations. Override the lifecycle + // default so audit-trail archives accurately attribute to the + // runner. Genuine user actions (web Archive via KillSession + // RPC, terminal Ctrl-C via SIGINT) still override to + // 'User terminated' in runnerLifecycle. + HAPI_DEFAULT_ARCHIVE_REASON: 'Stopped by runner' } }); From 123c625e750c2d8961174d87b17fb59a06e45afe Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 16 Jun 2026 01:48:57 +0100 Subject: [PATCH 3/5] fix(hub): markSessionArchivedFromHub surfaces persistence failures as 5xx Addresses second-round bot review of #923 (Major): `markSessionArchivedFromHub` silently returned on DB write errors and on exhausted version-retry attempts, which would let `/archive` claim 200 OK while the row stayed unarchived. That regresses the #916 acceptance criterion that non-RPC errors during archive must still propagate as 5xx. Both fall-through paths now throw, matching the contract of the sibling writers in this file (renameSession, mergeSessions). The sessionModel test suite gains two cases that spy on `store.sessions.updateSessionMetadata` to force `error` and `version-mismatch` shapes and asserts the helper throws. The existing route test at `hub/src/web/routes/sessions.test.ts:1015` already covers the route-level 500 propagation for any error thrown out of `archiveSession`, so no new route test is needed. Imports `spyOn` from `bun:test` to match this test file's runtime (the rest of the hub package uses bun:test, not vitest). Refs tiann/hapi#916. Co-authored-by: Cursor --- hub/src/sync/sessionCache.ts | 12 ++++++- hub/src/sync/sessionModel.test.ts | 54 ++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/hub/src/sync/sessionCache.ts b/hub/src/sync/sessionCache.ts index ae91ca92ce..941f4029ed 100644 --- a/hub/src/sync/sessionCache.ts +++ b/hub/src/sync/sessionCache.ts @@ -530,7 +530,11 @@ export class SessionCache { ) if (result.result === 'error') { - return + // tiann/hapi#916 review feedback: persistence failure must + // surface so the route returns 5xx. Silently returning here + // would let `/archive` claim success while the row stays + // unarchived in the DB. + throw new Error('Failed to archive session metadata from hub') } if (result.result === 'success') { @@ -540,6 +544,12 @@ export class SessionCache { this.refreshSession(sessionId) } + + // tiann/hapi#916 review feedback: exhausted retries means we never + // got a successful write. Match the renameSession / mergeSessions + // contract and surface this as an error so non-RPC failures stay + // 5xx per the issue's acceptance criteria. + throw new Error('Session was modified concurrently while archiving from hub') } async renameSession(sessionId: string, name: string): Promise { diff --git a/hub/src/sync/sessionModel.test.ts b/hub/src/sync/sessionModel.test.ts index f7584915b7..1b1429ae18 100644 --- a/hub/src/sync/sessionModel.test.ts +++ b/hub/src/sync/sessionModel.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it } from 'bun:test' +import { describe, expect, it, spyOn } from 'bun:test' import { toSessionSummary } from '@hapi/protocol' import type { SyncEvent } from '@hapi/protocol/types' import { Store } from '../store' @@ -2200,6 +2200,58 @@ describe('session model', () => { expect(meta?.archivedBy).toBe('hub') expect(meta?.name).toBe('oob') }) + + // tiann/hapi#916 review feedback: persistence failures must surface + // so the /archive route returns 5xx per the acceptance criteria + // "Non-RPC errors during archive still propagate as 5xx (DB write + // failure, etc.)" — silent return would let the route claim success + // while the row stays unarchived. + it('throws when the store reports a hard error on the metadata write', () => { + const store = new Store(':memory:') + const events: SyncEvent[] = [] + const cache = new SessionCache(store, createPublisher(events)) + + const session = cache.getOrCreateSession( + 'session-hub-archive-error', + { path: '/tmp/project', host: 'localhost', flavor: 'codex' }, + null, + 'default' + ) + + const updateSpy = spyOn(store.sessions, 'updateSessionMetadata').mockReturnValue({ + result: 'error', + error: new Error('simulated DB write failure') + } as ReturnType) + + try { + expect(() => cache.markSessionArchivedFromHub(session.id, 'CLI unreachable')).toThrow(/Failed to archive session metadata from hub/) + } finally { + updateSpy.mockRestore() + } + }) + + it('throws when retries are exhausted by sustained version-mismatch contention', () => { + const store = new Store(':memory:') + const events: SyncEvent[] = [] + const cache = new SessionCache(store, createPublisher(events)) + + const session = cache.getOrCreateSession( + 'session-hub-archive-exhausted', + { path: '/tmp/project', host: 'localhost', flavor: 'codex' }, + null, + 'default' + ) + + const updateSpy = spyOn(store.sessions, 'updateSessionMetadata').mockReturnValue({ + result: 'version-mismatch' + } as ReturnType) + + try { + expect(() => cache.markSessionArchivedFromHub(session.id, 'CLI unreachable')).toThrow(/Session was modified concurrently while archiving from hub/) + } finally { + updateSpy.mockRestore() + } + }) }) // tiann/hapi#919: the three metadata writers must self-heal on From a97b9dca632e38e62ce88ac9e971b32af74b5eaf Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 16 Jun 2026 01:57:11 +0100 Subject: [PATCH 4/5] revert(cli): drop HAPI_DEFAULT_ARCHIVE_REASON env override Reverts `1c8972a3`. Bot review round 3 surfaced that the env-on-spawn approach (the bot's own round-1 suggestion shape) mislabels hub-restart-cascade SIGTERMs against runner-spawned children: systemd killcgroup on `hapi-runner.service` stop sends SIGTERM to all runner-children directly, and those would archive as 'Stopped by runner' instead of 'Hub restart'. The two suggestions are mutually incompatible without adding an IPC channel (stdio: 'ipc' on spawn) so the runner can stamp setArchiveReason via childProcess.send() before SIGTERMing. That is a refactor, not a smallest-defensible change. Going back to the simple shape: SIGTERM default is 'Hub restart' for everyone, runner-internal stop paths share that label. The audit-trail-correctness criterion from the #914 issue is met (SIGTERM no longer falsely labels as 'User terminated'). Finer attribution between cascade vs runner-stop is deferred as a follow-up. Refs tiann/hapi#914. Co-authored-by: Cursor --- cli/src/agent/runnerLifecycle.test.ts | 53 --------------------------- cli/src/agent/runnerLifecycle.ts | 31 +++++++++------- cli/src/runner/run.ts | 11 +----- 3 files changed, 18 insertions(+), 77 deletions(-) diff --git a/cli/src/agent/runnerLifecycle.test.ts b/cli/src/agent/runnerLifecycle.test.ts index b3fc26fa3c..65699c3bac 100644 --- a/cli/src/agent/runnerLifecycle.test.ts +++ b/cli/src/agent/runnerLifecycle.test.ts @@ -71,57 +71,4 @@ describe('createRunnerLifecycle archiveReason defaults (tiann/hapi#914)', () => }) }) - // tiann/hapi#914 review feedback: runner-spawned children should not - // get the 'Hub restart' label when the runner SIGTERMs them via - // stop-session, webhook-timeout cleanup, or orphan cleanup. The runner - // sets HAPI_DEFAULT_ARCHIVE_REASON on the child's spawn env so the - // default reason is path-accurate without changing every kill site. - it('honours HAPI_DEFAULT_ARCHIVE_REASON env as the default reason', async () => { - const session = makeFakeSession() - const original = process.env.HAPI_DEFAULT_ARCHIVE_REASON - process.env.HAPI_DEFAULT_ARCHIVE_REASON = 'Stopped by runner' - try { - const lifecycle = createRunnerLifecycle({ - session: session as unknown as Parameters[0]['session'], - logTag: 'test' - }) - - await lifecycle.cleanup() - - expect(session.metadataWrites[0]).toMatchObject({ - archiveReason: 'Stopped by runner' - }) - } finally { - if (original === undefined) { - delete process.env.HAPI_DEFAULT_ARCHIVE_REASON - } else { - process.env.HAPI_DEFAULT_ARCHIVE_REASON = original - } - } - }) - - it('setArchiveReason still wins over HAPI_DEFAULT_ARCHIVE_REASON', async () => { - const session = makeFakeSession() - const original = process.env.HAPI_DEFAULT_ARCHIVE_REASON - process.env.HAPI_DEFAULT_ARCHIVE_REASON = 'Stopped by runner' - try { - const lifecycle = createRunnerLifecycle({ - session: session as unknown as Parameters[0]['session'], - logTag: 'test' - }) - - lifecycle.setArchiveReason('User terminated') - await lifecycle.cleanup() - - expect(session.metadataWrites[0]).toMatchObject({ - archiveReason: 'User terminated' - }) - } finally { - if (original === undefined) { - delete process.env.HAPI_DEFAULT_ARCHIVE_REASON - } else { - process.env.HAPI_DEFAULT_ARCHIVE_REASON = original - } - } - }) }) diff --git a/cli/src/agent/runnerLifecycle.ts b/cli/src/agent/runnerLifecycle.ts index 796544c56d..96e3aa93b0 100644 --- a/cli/src/agent/runnerLifecycle.ts +++ b/cli/src/agent/runnerLifecycle.ts @@ -23,24 +23,27 @@ export type RunnerLifecycle = { export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLifecycle { let exitCode = 0 - // tiann/hapi#914: default reason is parent-driven SIGTERM (the most - // common non-user cause). Genuine user actions (clicking Archive in the - // web UI, or Ctrl-C in a local terminal) explicitly reassign this via - // `setArchiveReason` BEFORE `cleanupAndExit` runs: + // tiann/hapi#914: default reason is 'Hub restart' (parent-driven SIGTERM + // is the most common non-user cause). Genuine user actions (clicking + // Archive in the web UI, or Ctrl-C in a local terminal) explicitly + // reassign this via `setArchiveReason` BEFORE `cleanupAndExit` runs: // - KillSession RPC handler → 'User terminated' (see registerKillSessionHandler) // - SIGINT handler → 'User terminated' (Ctrl-C in local terminal) // - uncaughtException/Reject → 'Session crashed' (via markCrash) // - // The default is parameterised via `HAPI_DEFAULT_ARCHIVE_REASON` so that - // the runner (cli/src/runner/run.ts) can label children it spawns with - // 'Stopped by runner' — runner-driven SIGTERMs (stop-session command, - // webhook-timeout cleanup, orphan cleanup) are NOT hub restarts, and - // hub-restart cascades for runner-spawned children also flow through - // the runner in production. Terminal-launched sessions (no runner - // parent) fall back to 'Hub restart' since the most plausible - // out-of-band SIGTERM source for those is the systemd hub-restart - // cascade documented at tiann/hapi#915. - let archiveReason = process.env.HAPI_DEFAULT_ARCHIVE_REASON || 'Hub restart' + // Out-of-band SIGTERM (hub-restart cascade, systemd cgroup kill on + // hapi-runner.service stop, `kill ` from the operator) keeps the + // default and is correctly labelled 'Hub restart' on the audit trail. + // + // Runner-internal stop paths (`hapi runner stop-session`, webhook-timeout + // cleanup at run.ts:587, orphan cleanup at run.ts:267) also currently + // hit this default - that is technically inaccurate but follows the + // friction-mode "smallest defensible change" rule for this PR. Finer + // attribution would require an IPC channel (stdio: 'ipc' on spawn) so + // the runner can stamp `setArchiveReason` before SIGTERMing; tracked as + // a follow-up to keep this PR focussed on the user-action lie that + // motivated #914. + let archiveReason = 'Hub restart' let sessionEndReason: SessionEndReason = 'terminated' let cleanupStarted = false let cleanupPromise: Promise | null = null diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts index a252522fe8..f49d3b15b1 100644 --- a/cli/src/runner/run.ts +++ b/cli/src/runner/run.ts @@ -456,16 +456,7 @@ export async function startRunner(options: { workspaceRoots?: string[] } = {}): stdio: ['ignore', 'pipe', 'pipe'], // Capture stdout/stderr for debugging env: { ...process.env, - ...extraEnv, - // tiann/hapi#914 review: runner-spawned children get SIGTERMed - // by three runner paths (stopSession, webhook-timeout cleanup, - // orphan cleanup). All three are runner actions, not hub - // restarts and not user terminations. Override the lifecycle - // default so audit-trail archives accurately attribute to the - // runner. Genuine user actions (web Archive via KillSession - // RPC, terminal Ctrl-C via SIGINT) still override to - // 'User terminated' in runnerLifecycle. - HAPI_DEFAULT_ARCHIVE_REASON: 'Stopped by runner' + ...extraEnv } }); From 98b10319c988ff0cb841e47461e07b0cf903e3d4 Mon Sep 17 00:00:00 2001 From: HeavyGee <133152184+heavygee@users.noreply.github.com> Date: Tue, 16 Jun 2026 02:05:34 +0100 Subject: [PATCH 5/5] fix(cli): clean completions get 'Session completed', not 'Hub restart' Addresses bot review round 4 of #923 (Major): every agent runner (runClaude, runCodex, runCursor, runGemini, runKimi, runOpencode) calls setSessionEndReason('completed') on the natural exit path without touching archiveReason. With the SIGTERM default flipped to 'Hub restart', clean completions were now archived as restart cascades. Fix: setSessionEndReason flips archiveReason to 'Session completed' when it transitions to 'completed' AND no caller has already overridden the archive reason. This covers all six agent runners with a single setter change (no per-runner edits). Two new tests cover the natural-completion default and the override precedence (explicit setArchiveReason still wins). Refs tiann/hapi#914. Co-authored-by: Cursor --- cli/src/agent/runnerLifecycle.test.ts | 36 +++++++++++++++++++++++++++ cli/src/agent/runnerLifecycle.ts | 12 +++++++++ 2 files changed, 48 insertions(+) diff --git a/cli/src/agent/runnerLifecycle.test.ts b/cli/src/agent/runnerLifecycle.test.ts index 65699c3bac..c806908198 100644 --- a/cli/src/agent/runnerLifecycle.test.ts +++ b/cli/src/agent/runnerLifecycle.test.ts @@ -71,4 +71,40 @@ describe('createRunnerLifecycle archiveReason defaults (tiann/hapi#914)', () => }) }) + // tiann/hapi#914 review round 4: clean agent-loop completions + // (runClaude / runCodex / runCursor / runGemini / runKimi / + // runOpencode all call setSessionEndReason('completed') without + // touching archiveReason) must not be archived as 'Hub restart'. + // The setSessionEndReason setter flips the default when the runner + // transitions to 'completed'. + it('setSessionEndReason("completed") flips the default reason to "Session completed"', async () => { + const session = makeFakeSession() + const lifecycle = createRunnerLifecycle({ + session: session as unknown as Parameters[0]['session'], + logTag: 'test' + }) + + lifecycle.setSessionEndReason('completed') + await lifecycle.cleanup() + + expect(session.metadataWrites[0]).toMatchObject({ + archiveReason: 'Session completed' + }) + }) + + it('an explicit setArchiveReason before setSessionEndReason("completed") still wins', async () => { + const session = makeFakeSession() + const lifecycle = createRunnerLifecycle({ + session: session as unknown as Parameters[0]['session'], + logTag: 'test' + }) + + lifecycle.setArchiveReason('User terminated') + lifecycle.setSessionEndReason('completed') + await lifecycle.cleanup() + + expect(session.metadataWrites[0]).toMatchObject({ + archiveReason: 'User terminated' + }) + }) }) diff --git a/cli/src/agent/runnerLifecycle.ts b/cli/src/agent/runnerLifecycle.ts index 96e3aa93b0..641bdd2f72 100644 --- a/cli/src/agent/runnerLifecycle.ts +++ b/cli/src/agent/runnerLifecycle.ts @@ -115,6 +115,18 @@ export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLi const setSessionEndReason = (reason: SessionEndReason) => { sessionEndReason = reason + // tiann/hapi#914 review round 4: every agent runner + // (runClaude / runCodex / runCursor / runGemini / runKimi / + // runOpencode) calls setSessionEndReason('completed') before + // cleanupAndExit() on the natural-exit path without setting an + // archive reason. With the SIGTERM-driven default of 'Hub restart', + // clean completions would otherwise be audit-trailed as restart + // cascades. Flip the default to 'Session completed' when the end + // reason transitions to 'completed' AND no caller has already + // overridden the archive reason. + if (reason === 'completed' && archiveReason === 'Hub restart') { + archiveReason = 'Session completed' + } } const markCrash = (error: unknown) => {