-
-
Notifications
You must be signed in to change notification settings - Fork 463
fix(hub,cli): four hub-restart-cascade cleanup bugs (#913 #914 #916 #919) #923
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
928b1b6
1c8972a
123c625
a97b9dc
98b1031
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| import { describe, expect, it, vi } from 'vitest' | ||
| import { createRunnerLifecycle } from './runnerLifecycle' | ||
|
|
||
| // tiann/hapi#914: the runnerLifecycle's default archiveReason is now | ||
| // 'Hub restart' (was 'User terminated'). Out-of-band SIGTERM from the | ||
| // hub-restart cascade keeps that default. Explicit user actions | ||
| // (clicking Archive in the web UI, Ctrl-C in a local terminal, | ||
| // uncaught exception) reassign the reason before archive metadata is | ||
| // written. | ||
|
|
||
| function makeFakeSession() { | ||
| const metadataWrites: Array<Record<string, unknown>> = [] | ||
| return { | ||
| updateMetadata: vi.fn((handler: (m: Record<string, unknown>) => Record<string, unknown>) => { | ||
| const next = handler({}) | ||
| metadataWrites.push(next) | ||
| return next | ||
| }), | ||
| sendSessionDeath: vi.fn(), | ||
| flush: vi.fn(async () => {}), | ||
| close: vi.fn(async () => {}), | ||
| metadataWrites | ||
| } | ||
| } | ||
|
|
||
| describe('createRunnerLifecycle archiveReason defaults (tiann/hapi#914)', () => { | ||
| it('uses Hub restart as the default archiveReason when no override is applied', async () => { | ||
| const session = makeFakeSession() | ||
| const lifecycle = createRunnerLifecycle({ | ||
| session: session as unknown as Parameters<typeof createRunnerLifecycle>[0]['session'], | ||
| logTag: 'test' | ||
| }) | ||
|
|
||
| await lifecycle.cleanup() | ||
|
|
||
| expect(session.metadataWrites).toHaveLength(1) | ||
| expect(session.metadataWrites[0]).toMatchObject({ | ||
| lifecycleState: 'archived', | ||
| archivedBy: 'cli', | ||
| archiveReason: 'Hub restart' | ||
| }) | ||
| }) | ||
|
|
||
| it('writes the operator-supplied reason when setArchiveReason is called (e.g. KillSession RPC)', async () => { | ||
| const session = makeFakeSession() | ||
| const lifecycle = createRunnerLifecycle({ | ||
| session: session as unknown as Parameters<typeof createRunnerLifecycle>[0]['session'], | ||
| logTag: 'test' | ||
| }) | ||
|
|
||
| lifecycle.setArchiveReason('User terminated') | ||
| await lifecycle.cleanup() | ||
|
|
||
| expect(session.metadataWrites[0]).toMatchObject({ | ||
| archiveReason: 'User terminated' | ||
| }) | ||
| }) | ||
|
|
||
| it('markCrash overrides the default reason to "Session crashed"', async () => { | ||
| const session = makeFakeSession() | ||
| const lifecycle = createRunnerLifecycle({ | ||
| session: session as unknown as Parameters<typeof createRunnerLifecycle>[0]['session'], | ||
| logTag: 'test' | ||
| }) | ||
|
|
||
| lifecycle.markCrash(new Error('boom')) | ||
| await lifecycle.cleanup() | ||
|
|
||
| expect(session.metadataWrites[0]).toMatchObject({ | ||
| archiveReason: 'Session crashed' | ||
| }) | ||
| }) | ||
| }) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,16 @@ export type RunnerLifecycle = { | |
|
|
||
| export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLifecycle { | ||
| let exitCode = 0 | ||
| let archiveReason = 'User terminated' | ||
| // tiann/hapi#914: default reason is 'Hub restart' (parent-driven SIGTERM | ||
| // is the most common non-user cause). Genuine user actions (clicking | ||
| // Archive in the web UI, or Ctrl-C in a local terminal) explicitly | ||
| // reassign this via `setArchiveReason` BEFORE `cleanupAndExit` runs: | ||
| // - KillSession RPC handler → 'User terminated' (see registerKillSessionHandler) | ||
| // - SIGINT handler → 'User terminated' (Ctrl-C in local terminal) | ||
| // - uncaughtException/Reject → 'Session crashed' (via markCrash) | ||
| // Out-of-band SIGTERM (hub-restart cascade, `kill <pid>` from host) keeps | ||
| // the default and is correctly labelled 'Hub restart' on the audit trail. | ||
| let archiveReason = 'Hub restart' | ||
| let sessionEndReason: SessionEndReason = 'terminated' | ||
| let cleanupStarted = false | ||
| let cleanupPromise: Promise<void> | null = null | ||
|
|
@@ -105,11 +114,19 @@ export function createRunnerLifecycle(options: RunnerLifecycleOptions): RunnerLi | |
| } | ||
|
|
||
| const registerProcessHandlers = () => { | ||
| // tiann/hapi#914: SIGTERM is treated as the default reason ('Hub restart') | ||
| // because the runner is restarted by systemd as part of hub restart in | ||
| // production. If a future code path needs to distinguish "operator | ||
| // killed the host process" from "hub restart", it can call | ||
| // setArchiveReason() before the runner exits. | ||
| process.on('SIGTERM', () => { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [Major] Runner-driven stop paths now get mislabeled as hub restarts. The new default makes every SIGTERM archive as Suggested fix: // propagate an explicit reason from runner stop/cleanup paths, e.g. via env on spawn
const archiveReason = process.env.HAPI_ARCHIVE_REASON
if (archiveReason) {
lifecycle.setArchiveReason(archiveReason)
} |
||
| void cleanupAndExit() | ||
| }) | ||
|
|
||
| // Ctrl-C in a local terminal is genuine user intent — keep the | ||
| // pre-#914 label so the audit trail still shows it. | ||
| process.on('SIGINT', () => { | ||
| archiveReason = 'User terminated' | ||
| void cleanupAndExit() | ||
| }) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| import { describe, expect, it, vi } from 'vitest' | ||
| import { RPC_METHODS } from '@hapi/protocol/rpcMethods' | ||
| import { registerKillSessionHandler } from './registerKillSessionHandler' | ||
|
|
||
| // tiann/hapi#914: the KillSession RPC is the authoritative "user-terminated" | ||
| // signal because the hub only sends it when the operator clicks Archive in | ||
| // the web UI. Out-of-band SIGTERM (hub-restart cascade, host-level `kill`) | ||
| // hits the SIGTERM signal handler in runnerLifecycle, which now keeps the | ||
| // default reason 'Hub restart' so the audit trail stays correct. | ||
| describe('registerKillSessionHandler (tiann/hapi#914)', () => { | ||
| function makeRegistry() { | ||
| const handlers = new Map<string, (params?: unknown) => unknown>() | ||
| return { | ||
| registerHandler: (method: string, handler: (params: unknown) => unknown) => { | ||
| handlers.set(method, handler as (params?: unknown) => unknown) | ||
| }, | ||
| handlers | ||
| } | ||
| } | ||
|
|
||
| it('stamps archiveReason=User terminated before triggering cleanupAndExit', async () => { | ||
| const registry = makeRegistry() | ||
| const lifecycle = { | ||
| setArchiveReason: vi.fn(), | ||
| cleanupAndExit: vi.fn(async () => {}) | ||
| } | ||
|
|
||
| registerKillSessionHandler( | ||
| registry as unknown as Parameters<typeof registerKillSessionHandler>[0], | ||
| lifecycle | ||
| ) | ||
|
|
||
| const handler = registry.handlers.get(RPC_METHODS.KillSession) | ||
| expect(handler).toBeDefined() | ||
|
|
||
| const result = await handler?.() | ||
| expect(result).toEqual({ success: true, message: 'Killing hapi CLI process' }) | ||
|
|
||
| // setArchiveReason MUST be called BEFORE cleanupAndExit so the archive | ||
| // metadata write reads the correct reason. | ||
| const setReasonOrder = lifecycle.setArchiveReason.mock.invocationCallOrder[0] | ||
| const cleanupOrder = lifecycle.cleanupAndExit.mock.invocationCallOrder[0] | ||
| expect(setReasonOrder).toBeLessThan(cleanupOrder) | ||
| expect(lifecycle.setArchiveReason).toHaveBeenCalledWith('User terminated') | ||
| expect(lifecycle.cleanupAndExit).toHaveBeenCalled() | ||
| }) | ||
|
|
||
| it('still works with the legacy `(cleanupAndExit: () => Promise<void>)` call shape', async () => { | ||
| // Back-compat: runAgentSession.ts passes a bare closure as the second | ||
| // argument instead of a lifecycle object. The handler should not crash | ||
| // when setArchiveReason is absent. | ||
| const registry = makeRegistry() | ||
| const cleanupAndExit = vi.fn(async () => {}) | ||
|
|
||
| registerKillSessionHandler( | ||
| registry as unknown as Parameters<typeof registerKillSessionHandler>[0], | ||
| cleanupAndExit | ||
| ) | ||
|
|
||
| const handler = registry.handlers.get(RPC_METHODS.KillSession) | ||
| await handler?.() | ||
|
|
||
| expect(cleanupAndExit).toHaveBeenCalled() | ||
| }) | ||
| }) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[Major] This changed default is also used by ordinary
cleanup()/cleanupAndExit()calls, not only by the SIGTERM handler. The clean completion paths in the launchers set onlysessionEndReasontocompletedbefore cleanup (for examplerunCodex.ts:377), andarchiveAndClosepersists whateverarchiveReasoncurrently holds. Result: successful agent exits now get archived witharchiveReason: 'Hub restart', which pollutes the audit trail this PR is trying to correct.Suggested fix: