Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 13 additions & 14 deletions src/feedback-loops.ts
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,14 @@ export function extractErrorSubtype(errorMsg: string): string {
if (lower.includes('reached maximum budget')) return 'budget_exceeded';
if (lower.includes('aborted by user')) return 'user_abort';
// Issue #491: Chinese-localized upstream 400 errors — split from transient_fast_band.
// These fire on attempt 1/3 (no prior call for any window to throttle), dur<20s, with
// "exit N/A" in the message (Anthropic server rejected the request, not a CLI hang).
// Must be extracted BEFORE the transient_fast_band branch so they get their own subtype
// and stop polluting the circuit-breaker metric with structurally-ungatable events.
// Discriminator (per issue spec): Chinese localized message + dur<20s + exit N/A.
// The `attempt 1/3` example in the issue only illustrated WHY the caller-side
// gate can't help (there is no prior call to throttle); it is NOT part of the
// discriminator. Even fires on attempt 3/3 that carry `exit N/A` are upstream
// rejects — the upstream returned a localized 400 and there's nothing the breaker can do.
// Must be extracted BEFORE the transient_fast_band branch so they get their own
// subtype and stop polluting the circuit-breaker metric with structurally-
// ungatable events.
if (
(/處理訊息時發生錯誤|請稍後再試/.test(errorMsg)) &&
/exit n\/a/i.test(errorMsg)
Expand Down Expand Up @@ -285,16 +289,11 @@ export function extractErrorSubtype(errorMsg: string): string {
// the bucket never silently disappears from the recurring-errors panel.
if (durMatch) {
const dur = Number(durMatch[1]);
// Issue #491: split off upstream_quickreject_cn from transient_fast_band.
// Signature: 處理訊息時發生錯誤 + dur<20s + attempt 1/N — fires on the FIRST
// call before any prior call exists for the caller-side fast-band gate
// (#443/#446 damper) to throttle against. Different remediation lane
// (back-off seconds-to-minutes vs. classic transient retry); keeping it
// mixed into transient_fast_band pollutes circuit-breaker telemetry and
// spawns false [REGRESSION] tasks for upstream events the breaker can't
// influence. Attempt 2-3/N retry-storm fires stay as transient_fast_band
// (genuine fast-band the damper applies to).
if (dur < 20 && /attempt 1\//.test(lower)) return 'upstream_quickreject_cn';
// Issue #491: upstream_quickreject_cn is handled by the EARLIER branch above
// (~line 250), which is gated on `exit N/A`. The gate that used to live here,
// keyed on `attempt 1/`, was over-constrained — per the issue #491 spec the
// discriminator is the localized message + dur<20s + exit N/A, not the attempt
// number. Removed 2026-05-10 to resolve the PR #492 / PR #493 disagreement (split-brain merge).
if (dur < 10) return 'transient_fast_band';
if (dur < 60) return 'transient_slow_band';
}
Expand Down
29 changes: 18 additions & 11 deletions tests/extract-error-subtype.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,36 +37,43 @@ describe('extractErrorSubtype — transient band split (#318)', () => {
});
});

describe('extractErrorSubtype — upstream_quickreject_cn split (#491)', () => {
describe('extractErrorSubtype — upstream_quickreject_cn split (#491) — full message shape', () => {
// Real fire format observed in error-patterns.json (UNKNOWN:transient_fast_band::callClaude):
// "claude CLI UNKNOWN (exit N/A, 8100ms this attempt, 8100ms total, attempt 1/3,
// prompt 36402 chars, loop lane): 處理訊息時發生錯誤 [dur=8s]。請稍後再試..."
// Discriminator: attempt 1/N + dur<20s + 處理訊息時發生錯誤 → upstream rejects
// before the caller-side fast-band gate has any prior call to throttle against.
// Discriminator (per issue #491 spec): localized message + dur<20s + exit N/A.
// The attempt number is NOT part of the discriminator — even attempt 3/3 with
// exit N/A is an upstream reject that the caller-side gate cannot prevent.
// Post-merge reconciliation (2026-05-10): PR #492 (correct) shipped without an
// attempt gate; PR #493 added one and was over-constrained. This file previously
// encoded the wrong rule; the tests now match the spec.
const mkUpstreamMsg = (durSec: number, attempt: number, totalAttempts = 3) =>
`claude CLI UNKNOWN (exit N/A, ${durSec * 1000}ms this attempt, ${durSec * 1000}ms total, ` +
`attempt ${attempt}/${totalAttempts}, prompt 36402 chars, loop lane): 處理訊息時發生錯誤 [dur=${durSec}s]。請稍後再試。`;

it('classifies attempt 1/N + dur<20s as upstream_quickreject_cn', () => {
it('classifies attempt 1/N + dur<20s + exit N/A as upstream_quickreject_cn', () => {
expect(extractErrorSubtype(mkUpstreamMsg(8, 1))).toBe('upstream_quickreject_cn');
expect(extractErrorSubtype(mkUpstreamMsg(1, 1))).toBe('upstream_quickreject_cn');
expect(extractErrorSubtype(mkUpstreamMsg(19, 1))).toBe('upstream_quickreject_cn');
});

it('keeps attempt 2-3/N retry-storm fires as transient_fast_band (existing #318 path)', () => {
// Genuine retry-storm: caller has prior call to gate against; #443/#446 damper applies.
expect(extractErrorSubtype(mkUpstreamMsg(8, 2))).toBe('transient_fast_band');
expect(extractErrorSubtype(mkUpstreamMsg(8, 3))).toBe('transient_fast_band');
it('classifies attempt 2-3/N + exit N/A as upstream_quickreject_cn (spec: discriminator is exit N/A, not attempt)', () => {
expect(extractErrorSubtype(mkUpstreamMsg(8, 2))).toBe('upstream_quickreject_cn');
expect(extractErrorSubtype(mkUpstreamMsg(8, 3))).toBe('upstream_quickreject_cn');
});

it('does not over-match attempt 1 with dur>=20s (slow-band stays transient_slow_band)', () => {
// dur=20s should fall through to transient_slow_band, not upstream_quickreject_cn.
it('does not over-match dur>=20s + exit N/A (falls through to transient_slow_band)', () => {
expect(extractErrorSubtype(mkUpstreamMsg(20, 1))).toBe('transient_slow_band');
expect(extractErrorSubtype(mkUpstreamMsg(59, 1))).toBe('transient_slow_band');
});

it('keeps genuine retry-storm WITHOUT exit N/A as transient_fast_band', () => {
// No exit N/A → real CLI/transient failure, not an upstream reject.
const msg = '處理訊息時發生錯誤 attempt 3/3, prompt 26866 chars, dur=8s';
expect(extractErrorSubtype(msg)).toBe('transient_fast_band');
});

it('preserves no_diag fallback when 處理訊息時發生錯誤 has no dur= suffix', () => {
// Sanity: existing fallback path untouched even with attempt 1/N present.
const msg = 'attempt 1/3 處理訊息時發生錯誤 some text without duration';
expect(extractErrorSubtype(msg)).toBe('no_diag');
});
Expand Down