Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions cli/src/agent/runnerLifecycle.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import { describe, expect, it, vi } from 'vitest'
import { createRunnerLifecycle } from './runnerLifecycle'

// tiann/hapi#914: the runnerLifecycle's default archiveReason is now
// 'Hub restart' (was 'User terminated'). Out-of-band SIGTERM from the
// hub-restart cascade keeps that default. Explicit user actions
// (clicking Archive in the web UI, Ctrl-C in a local terminal) and
// crashes (uncaught exceptions, reported via markCrash) reassign the
// reason before archive metadata is written.

/**
 * Builds a minimal stand-in for the session object handed to
 * `createRunnerLifecycle`.
 *
 * Every object produced by an `updateMetadata` handler is appended to
 * `metadataWrites`, so a test can inspect exactly what would have been
 * persisted. The remaining members are bare mocks.
 */
function makeFakeSession() {
    const metadataWrites: Array<Record<string, unknown>> = []

    // Runs the handler against an empty metadata object, records the result,
    // and hands that same result back to the caller.
    const updateMetadata = vi.fn(
        (handler: (m: Record<string, unknown>) => Record<string, unknown>) => {
            const produced = handler({})
            metadataWrites.push(produced)
            return produced
        }
    )

    const sendSessionDeath = vi.fn()
    const flush = vi.fn(async () => {})
    const close = vi.fn(async () => {})

    return { updateMetadata, sendSessionDeath, flush, close, metadataWrites }
}

describe('createRunnerLifecycle archiveReason defaults (tiann/hapi#914)', () => {
    type LifecycleSession = Parameters<typeof createRunnerLifecycle>[0]['session']

    // Each test gets its own fake session wired into a fresh lifecycle, so
    // recorded metadata writes never leak between cases.
    const setup = () => {
        const session = makeFakeSession()
        const lifecycle = createRunnerLifecycle({
            session: session as unknown as LifecycleSession,
            logTag: 'test'
        })
        return { session, lifecycle }
    }

    it('uses Hub restart as the default archiveReason when no override is applied', async () => {
        const { session, lifecycle } = setup()

        await lifecycle.cleanup()

        const writes = session.metadataWrites
        expect(writes).toHaveLength(1)
        expect(writes[0]).toMatchObject({
            lifecycleState: 'archived',
            archivedBy: 'cli',
            archiveReason: 'Hub restart'
        })
    })

    it('writes the operator-supplied reason when setArchiveReason is called (e.g. KillSession RPC)', async () => {
        const { session, lifecycle } = setup()

        // The override must land before cleanup writes the archive metadata.
        lifecycle.setArchiveReason('User terminated')
        await lifecycle.cleanup()

        const [firstWrite] = session.metadataWrites
        expect(firstWrite).toMatchObject({
            archiveReason: 'User terminated'
        })
    })

    it('markCrash overrides the default reason to "Session crashed"', async () => {
        const { session, lifecycle } = setup()

        lifecycle.markCrash(new Error('boom'))
        await lifecycle.cleanup()

        const [firstWrite] = session.metadataWrites
        expect(firstWrite).toMatchObject({
            archiveReason: 'Session crashed'
        })
    })
})
19 changes: 18 additions & 1 deletion cli/src/agent/runnerLifecycle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,16 @@ export type RunnerLifecycle = {

export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLifecycle {
let exitCode = 0
let archiveReason = 'User terminated'
// tiann/hapi#914: default reason is 'Hub restart' (parent-driven SIGTERM
// is the most common non-user cause). Genuine user actions (clicking
// Archive in the web UI, or Ctrl-C in a local terminal) explicitly
// reassign this via `setArchiveReason` BEFORE `cleanupAndExit` runs:
// - KillSession RPC handler → 'User terminated' (see registerKillSessionHandler)
// - SIGINT handler → 'User terminated' (Ctrl-C in local terminal)
// - uncaughtException/Reject → 'Session crashed' (via markCrash)
// Out-of-band SIGTERM (hub-restart cascade, `kill <pid>` from host) keeps
// the default and is correctly labelled 'Hub restart' on the audit trail.
let archiveReason = 'Hub restart'

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Major] This changed default is also used by ordinary cleanup() / cleanupAndExit() calls, not only by the SIGTERM handler. The clean completion paths in the launchers set only sessionEndReason to completed before cleanup (for example runCodex.ts:377), and archiveAndClose persists whatever archiveReason currently holds. Result: successful agent exits now get archived with archiveReason: 'Hub restart', which pollutes the audit trail this PR is trying to correct.

Suggested fix:

const setSessionEndReason = (reason: SessionEndReason) => {
    sessionEndReason = reason
    if (reason === 'completed' && archiveReason === 'Hub restart') {
        archiveReason = 'Session completed'
    }
}

let sessionEndReason: SessionEndReason = 'terminated'
let cleanupStarted = false
let cleanupPromise: Promise<void> | null = null
Expand Down Expand Up @@ -105,11 +114,19 @@ export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLi
}

const registerProcessHandlers = () => {
// tiann/hapi#914: SIGTERM is treated as the default reason ('Hub restart')
// because the runner is restarted by systemd as part of hub restart in
// production. If a future code path needs to distinguish "operator
// killed the host process" from "hub restart", it can call
// setArchiveReason() before the runner exits.
process.on('SIGTERM', () => {

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Major] Runner-driven stop paths now get mislabeled as hub restarts. The new default makes every SIGTERM archive as Hub restart unless a caller stamps a different reason first. Web Archive now does that through registerKillSessionHandler, but hapi runner stop-session, runner webhook timeout cleanup, and orphan cleanup terminate child sessions with SIGTERM directly (cli/src/runner/run.ts:267, cli/src/runner/run.ts:587). Those are operator/runner actions, not hub restarts, yet this SIGTERM handler keeps the new default, so archived metadata becomes misleading and the audit-trail fix regresses another supported termination path.

Suggested fix:

// propagate an explicit reason from runner stop/cleanup paths, e.g. via env on spawn
const archiveReason = process.env.HAPI_ARCHIVE_REASON
if (archiveReason) {
    lifecycle.setArchiveReason(archiveReason)
}

void cleanupAndExit()
})

// Ctrl-C in a local terminal is genuine user intent — keep the
// pre-#914 label so the audit trail still shows it.
process.on('SIGINT', () => {
archiveReason = 'User terminated'
void cleanupAndExit()
})

Expand Down
15 changes: 15 additions & 0 deletions cli/src/api/apiSession.ts
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,21 @@ export class ApiSessionClient extends EventEmitter {
})
}

/**
 * tiann/hapi#913: resolves once any pending `update-metadata` writes have
 * been acked by the hub, or once the timeout elapses, whichever comes first.
 *
 * `updateMetadata` stays fire-and-forget at the call site because it is
 * invoked on the hot path for every turn. This helper is for the few
 * callers that need durability (fresh ACP session-id pre-registration is
 * the canonical case): they can await persistence here without every other
 * caller's signature having to change.
 *
 * @param timeoutMs Maximum time to wait, in milliseconds.
 * @returns `true` when the lock drained, `false` when the timeout fired.
 */
async flushMetadata(timeoutMs: number = 5_000): Promise<boolean> {
    const drained = await this.drainLock(this.metadataLock, timeoutMs)
    return drained
}

async flush(options?: { timeoutMs?: number }): Promise<void> {
const deadlineMs = Date.now() + (options?.timeoutMs ?? 5_000)

Expand Down
65 changes: 65 additions & 0 deletions cli/src/claude/registerKillSessionHandler.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { describe, expect, it, vi } from 'vitest'
import { RPC_METHODS } from '@hapi/protocol/rpcMethods'
import { registerKillSessionHandler } from './registerKillSessionHandler'

// tiann/hapi#914: the KillSession RPC is the authoritative "user-terminated"
// signal because the hub only sends it when the operator clicks Archive in
// the web UI. Out-of-band SIGTERM (hub-restart cascade, host-level `kill`)
// hits the SIGTERM signal handler in runnerLifecycle, which now keeps the
// default reason 'Hub restart' so the audit trail stays correct.
describe('registerKillSessionHandler (tiann/hapi#914)', () => {
    type RpcManager = Parameters<typeof registerKillSessionHandler>[0]
    type CapturedHandler = (params?: unknown) => unknown

    // Minimal registry that captures handlers by RPC method name so a test
    // can look one up and invoke it directly.
    const makeRegistry = () => {
        const handlers = new Map<string, CapturedHandler>()
        const registerHandler = (method: string, handler: (params: unknown) => unknown) => {
            handlers.set(method, handler as CapturedHandler)
        }
        return { registerHandler, handlers }
    }

    it('stamps archiveReason=User terminated before triggering cleanupAndExit', async () => {
        const registry = makeRegistry()
        const setArchiveReason = vi.fn()
        const cleanupAndExit = vi.fn(async () => {})
        const lifecycle = { setArchiveReason, cleanupAndExit }

        registerKillSessionHandler(registry as unknown as RpcManager, lifecycle)

        const killHandler = registry.handlers.get(RPC_METHODS.KillSession)
        expect(killHandler).toBeDefined()

        const response = await killHandler?.()
        expect(response).toEqual({ success: true, message: 'Killing hapi CLI process' })

        // The reason has to be stamped first: the archive metadata write
        // performed during cleanup reads whatever reason is set at that point.
        const [reasonCallOrder] = lifecycle.setArchiveReason.mock.invocationCallOrder
        const [cleanupCallOrder] = lifecycle.cleanupAndExit.mock.invocationCallOrder
        expect(reasonCallOrder).toBeLessThan(cleanupCallOrder)
        expect(lifecycle.setArchiveReason).toHaveBeenCalledWith('User terminated')
        expect(lifecycle.cleanupAndExit).toHaveBeenCalled()
    })

    it('still works with the legacy `(cleanupAndExit: () => Promise<void>)` call shape', async () => {
        // Back-compat: runAgentSession.ts hands over a bare closure as the
        // second argument rather than a lifecycle object, so the handler must
        // tolerate setArchiveReason being absent.
        const registry = makeRegistry()
        const cleanupAndExit = vi.fn(async () => {})

        registerKillSessionHandler(registry as unknown as RpcManager, cleanupAndExit)

        const killHandler = registry.handlers.get(RPC_METHODS.KillSession)
        await killHandler?.()

        expect(cleanupAndExit).toHaveBeenCalled()
    })
})
29 changes: 26 additions & 3 deletions cli/src/claude/registerKillSessionHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,41 @@ interface KillSessionResponse {
message: string;
}

/**
* tiann/hapi#914: the slice of the runner lifecycle that the KillSession
* RPC handler needs. Callers of `registerKillSessionHandler` can pass
* either a bare `cleanupAndExit` closure (legacy) or an object of this
* shape, which lets the kill-RPC stamp an explicit `archiveReason` before
* the lifecycle teardown runs. The hub only sends KillSession when the
* operator clicked Archive in the UI, so this RPC is the authoritative
* "user-terminated" signal; out-of-band SIGTERM from a hub-restart cascade
* no longer collides with the default archive reason.
*/
export interface KillSessionLifecycle {
/**
* Starts the lifecycle teardown. The handler invokes it without awaiting
* the returned promise, so the RPC response can still be sent.
*/
cleanupAndExit: () => Promise<void>;
/**
* Optional hook for overriding the archive reason. When present, the
* handler calls it with 'User terminated' before `cleanupAndExit`; when
* absent (legacy bare-closure callers), the call is skipped.
*/
setArchiveReason?: (reason: string) => void;
}

export function registerKillSessionHandler(
rpcHandlerManager: RpcHandlerManager,
killThisHappy: () => Promise<void>
lifecycleOrCleanup: KillSessionLifecycle | (() => Promise<void>)
) {
const lifecycle: KillSessionLifecycle = typeof lifecycleOrCleanup === 'function'
? { cleanupAndExit: lifecycleOrCleanup }
: lifecycleOrCleanup;

rpcHandlerManager.registerHandler<KillSessionRequest, KillSessionResponse>(RPC_METHODS.KillSession, async () => {
logger.debug('Kill session request received');

// tiann/hapi#914: stamp the archive reason from the RPC path so the
// default in `runnerLifecycle.ts` can be reassigned away from
// 'User terminated'. A hub-restart-cascade SIGTERM does NOT go
// through this handler — it hits the SIGTERM signal handler — so
// those archives now stay labelled `'Hub restart'` (the new default).
lifecycle.setArchiveReason?.('User terminated');

// This will start the cleanup process
void killThisHappy();
void lifecycle.cleanupAndExit();

// We should still be able to respond the the client, though they
// We should still be able to respond to the client, though they
// should optimistically assume the session is dead.
return {
success: true,
Expand Down
2 changes: 1 addition & 1 deletion cli/src/claude/runClaude.ts
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ export async function runClaude(options: StartOptions = {}): Promise<void> {
});

lifecycle.registerProcessHandlers();
registerKillSessionHandler(session.rpcHandlerManager, lifecycle.cleanupAndExit);
registerKillSessionHandler(session.rpcHandlerManager, lifecycle);
registerLocalHandoffHandler(session.rpcHandlerManager, lifecycle);

// Set initial agent state
Expand Down
2 changes: 1 addition & 1 deletion cli/src/codex/runCodex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ export async function runCodex(opts: {
});

lifecycle.registerProcessHandlers();
registerKillSessionHandler(session.rpcHandlerManager, lifecycle.cleanupAndExit);
registerKillSessionHandler(session.rpcHandlerManager, lifecycle);
registerLocalHandoffHandler(session.rpcHandlerManager, lifecycle);

const applyCurrentConfigToSession = (options?: { syncModel?: boolean }) => {
Expand Down
Loading
Loading