diff --git a/packages/engine/src/config.ts b/packages/engine/src/config.ts index e855af8d7..ab0871ad2 100644 --- a/packages/engine/src/config.ts +++ b/packages/engine/src/config.ts @@ -94,7 +94,12 @@ export interface EngineConfig { ffmpegEncodeTimeout: number; /** Timeout for FFmpeg mux/faststart processes (ms). Default: 300_000 */ ffmpegProcessTimeout: number; - /** Timeout for FFmpeg streaming encode (ms). Default: 600_000 */ + /** + * Inactivity timeout for FFmpeg streaming encode (ms). The timer resets on + * every successful `writeFrame` call, so this caps the duration of a + * single "no frame arrived" gap (capture hang, dead Chrome), not the total + * render time. Default: 600_000 (10 minutes without any frame = dead). + */ ffmpegStreamingTimeout: number; // ── HDR ────────────────────────────────────────────────────────────── diff --git a/packages/engine/src/services/streamingEncoder.test.ts b/packages/engine/src/services/streamingEncoder.test.ts index cfcb6774c..70fd95ea5 100644 --- a/packages/engine/src/services/streamingEncoder.test.ts +++ b/packages/engine/src/services/streamingEncoder.test.ts @@ -572,4 +572,71 @@ describe("spawnStreamingEncoder lifecycle and cleanup", () => { controller.abort(); expect(proc.kill).not.toHaveBeenCalled(); }); + + it("inactivity timeout fires only after a no-frame gap exceeds ffmpegStreamingTimeout", async () => { + vi.useFakeTimers(); + try { + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { spawnStreamingEncoder } = await import("./streamingEncoder.js"); + const dir = mkdtempSync(join(tmpdir(), "se-heartbeat-")); + const encoder = await spawnStreamingEncoder(join(dir, "out.mp4"), baseOptions, undefined, { + ffmpegStreamingTimeout: 1000, + }); + + const proc = calls[0]!.proc; + + // Frames every 900ms — under the 1000ms inactivity threshold — should + // keep resetting the timer. After 9× 900ms = 8.1s of "slow but + // progressing" capture the encoder must still be alive. The old total- + // render timeout would have fired SIGTERM at ~1000ms. + for (let i = 0; i < 9; i++) { + encoder.writeFrame(Buffer.from([i])); + vi.advanceTimersByTime(900); + } + expect(proc.kill).not.toHaveBeenCalled(); + + // Now stall — no writeFrame for longer than the threshold. SIGTERM fires. + vi.advanceTimersByTime(1100); + expect(proc.kill).toHaveBeenCalledWith("SIGTERM"); + } finally { + vi.useRealTimers(); + } + }); + + it("inactivity timeout still fires when stdin is backpressured (stalled ffmpeg, live producer)", async () => { + vi.useFakeTimers(); + try { + // Simulate the FFmpeg-hangs-but-Chrome-keeps-producing case: stdin.write + // always returns false (Node has to buffer because ffmpeg isn't draining + // the pipe). The heartbeat must NOT reset on those buffered writes — + // otherwise a hung ffmpeg with a steady frame producer would never + // SIGTERM and we'd grow Node's stdin buffer until OOM. + const { spawn, calls } = createSpawnSpy(); + vi.resetModules(); + vi.doMock("child_process", () => ({ spawn })); + + const { spawnStreamingEncoder } = await import("./streamingEncoder.js"); + const dir = mkdtempSync(join(tmpdir(), "se-backpressure-")); + const encoder = await spawnStreamingEncoder(join(dir, "out.mp4"), baseOptions, undefined, { + ffmpegStreamingTimeout: 1000, + }); + + const proc = calls[0]!.proc; + proc.stdin.write = (_chunk: Buffer) => false; + + // Pump 9 frames at 900ms intervals — all returning false. The reset + // should NOT fire (every write was buffered, not accepted), so the + // 1000ms timer (last reset on spawn) elapses near the start. + for (let i = 0; i < 9; i++) { + encoder.writeFrame(Buffer.from([i])); + vi.advanceTimersByTime(900); + } + expect(proc.kill).toHaveBeenCalledWith("SIGTERM"); + } finally { + vi.useRealTimers(); + } + }); }); diff --git a/packages/engine/src/services/streamingEncoder.ts b/packages/engine/src/services/streamingEncoder.ts index a31edeb01..9f2bcf81e 100644 --- a/packages/engine/src/services/streamingEncoder.ts +++ b/packages/engine/src/services/streamingEncoder.ts @@ -416,13 +416,26 @@ export async function spawnStreamingEncoder( } } - // Timeout safety + // Inactivity timeout: fires only when no frame has been written for + // `ffmpegStreamingTimeout` ms. A slow-but-progressing capture (e.g. a CI + // runner under load) keeps resetting the timer on each writeFrame, so total + // wall-clock render time is unbounded — only a true hang (Chrome dead, + // capture stuck, no frames arriving) trips SIGTERM. The 600s default was + // previously a total-render cap, which intermittently killed legitimate + // slow renders mid-encode (FFmpeg got SIGTERM after most frames were sent; + // libx264 printed its summary and exited 255, observable as + // "Streaming encode failed: FFmpeg exited with code 255" with audio:0kB). const streamingTimeout = config?.ffmpegStreamingTimeout ?? DEFAULT_CONFIG.ffmpegStreamingTimeout; - const timer = setTimeout(() => { - if (exitStatus === "running") { - ffmpeg.kill("SIGTERM"); - } - }, streamingTimeout); + let timer: NodeJS.Timeout | null = null; + const resetTimer = () => { + if (timer) clearTimeout(timer); + timer = setTimeout(() => { + if (exitStatus === "running") { + ffmpeg.kill("SIGTERM"); + } + }, streamingTimeout); + }; + resetTimer(); const encoder: StreamingEncoder = { writeFrame: (buffer: Buffer): boolean => { @@ -433,11 +446,20 @@ export async function spawnStreamingEncoder( // provided buffer and drain it asynchronously. The HDR path's compositor // reuses pre-allocated transOutput/normalCanvas buffers across frames, // so without this copy the pipe would read partially-overwritten data - // and flicker. The SDR path doesn't invoke writeFrame at all (it pipes - // PNG files via encodeFramesFromDir), so the memcpy here is HDR-only - // and justified by correctness. + // and flicker. const copy = Buffer.from(buffer); - return ffmpeg.stdin.write(copy); + const accepted = ffmpeg.stdin.write(copy); + // Reset inactivity timer ONLY on `accepted === true`. `true` means the + // write went through to the kernel pipe without buffering in Node — + // proof FFmpeg is actually consuming. `false` means Node's writable + // stream had to buffer (FFmpeg hasn't drained the pipe yet); we deliberately + // don't reset on `false` so a hung FFmpeg with a still-producing Chrome + // can't keep us alive forever while Node's stdin buffer grows to OOM. In + // steady state with a slower-but-alive FFmpeg, writes alternate between + // true and false as the buffer drains and refills; the trues are enough + // to keep the heartbeat ticking. + if (accepted) resetTimer(); + return accepted; }, close: async (): Promise => { @@ -455,7 +477,10 @@ export async function spawnStreamingEncoder( // repeated calls. If you change this method, preserve idempotency or // a regression here will silently double-close ffmpeg and produce // harder-to-trace errors at the orchestrator layer. - clearTimeout(timer); + if (timer) { + clearTimeout(timer); + timer = null; + } if (signal) signal.removeEventListener("abort", onAbort); const stdin = ffmpeg.stdin;