Skip to content

Commit 6cd3bab

Browse files
committed
fix: standardize empty result acceptance across retry policies and network configurations
1 parent 5186a14 commit 6cd3bab

14 files changed

Lines changed: 325 additions & 69 deletions

architecture/evm/hooks_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_ListedMethods(t *testing.T) {
4141
}
4242

4343
// Create a test network with the default methods configured
44-
network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)
44+
network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())
4545

4646
for _, m := range methods {
4747
// Build a minimal request with method m
@@ -82,7 +82,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_RetryEmptyFalse(t *testing.T) {
8282
}
8383

8484
// Create a test network with the default methods configured
85-
network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)
85+
network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())
8686

8787
for _, m := range methods {
8888
// Build a minimal request with method m
@@ -125,7 +125,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_NonListedMethods(t *testing.T) {
125125
}
126126

127127
// Create a test network with the default methods configured
128-
network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)
128+
network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())
129129

130130
for _, m := range methods {
131131
// Build a minimal request with method m

common/defaults.go

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,29 +1908,36 @@ func (n *NetworkConfig) SetDefaults(upstreams []*UpstreamConfig, defaults *Netwo
19081908
const DefaultEvmFinalityDepth = 1024
19091909
const DefaultEvmStatePollerDebounce = Duration(5 * time.Second)
19101910

1911-
// DefaultMarkEmptyAsErrorMethods lists the default methods for which empty/null results
1912-
// should be treated as "missing data" errors, triggering retry on other upstreams.
1913-
// Note: eth_getBlockByHash is intentionally excluded because subgraph-based upstreams
1914-
// commonly return empty for this method, which is expected behavior.
1915-
// Note: eth_getTransactionReceipt is excluded as a quick remedy. Ideally we'd only allow null
1916-
// for pending txs (historical txs should retry on other upstreams). The "retry on empty" directive
1917-
// can still be used since some nodes may already have the receipt.
1918-
var DefaultMarkEmptyAsErrorMethods = []string{
1919-
// Block lookups (eth_getBlockByHash excluded - subgraphs return empty for it)
1920-
"eth_getBlockByNumber",
1921-
"eth_getBlockReceipts",
1922-
// Transaction lookups (eth_getTransactionReceipt excluded - see note above)
1923-
"eth_getTransactionByHash",
1924-
"eth_getTransactionByBlockHashAndIndex",
1925-
"eth_getTransactionByBlockNumberAndIndex",
1926-
// Uncle/ommers (legacy API)
1927-
"eth_getUncleByBlockHashAndIndex",
1928-
"eth_getUncleByBlockNumberAndIndex",
1929-
// Traces (debug/trace/parity modules)
1930-
"debug_traceTransaction",
1931-
"trace_transaction",
1932-
"trace_block",
1933-
"trace_get",
1911+
// DefaultEmptyResultAccept returns a fresh copy of the methods for which an
1912+
// empty/null result is considered valid (e.g. eth_getLogs, eth_call). A new
1913+
// slice is returned on every call so callers cannot mutate the shared default.
1914+
func DefaultEmptyResultAccept() []string {
1915+
return []string{"eth_getLogs", "eth_call"}
1916+
}
1917+
1918+
// DefaultMarkEmptyAsErrorMethods returns a fresh copy of the methods for which
1919+
// empty/null results should be treated as "missing data" errors, triggering retry
1920+
// on other upstreams. A new slice is returned on every call so callers cannot
1921+
// mutate the shared default.
1922+
//
1923+
// Note: eth_getBlockByHash is intentionally excluded because subgraph-based
1924+
// upstreams commonly return empty for this method, which is expected behavior.
1925+
// Note: eth_getTransactionReceipt is excluded as a quick remedy. Ideally we'd
1926+
// only allow null for pending txs.
1927+
func DefaultMarkEmptyAsErrorMethods() []string {
1928+
return []string{
1929+
"eth_getBlockByNumber",
1930+
"eth_getBlockReceipts",
1931+
"eth_getTransactionByHash",
1932+
"eth_getTransactionByBlockHashAndIndex",
1933+
"eth_getTransactionByBlockNumberAndIndex",
1934+
"eth_getUncleByBlockHashAndIndex",
1935+
"eth_getUncleByBlockNumberAndIndex",
1936+
"debug_traceTransaction",
1937+
"trace_transaction",
1938+
"trace_block",
1939+
"trace_get",
1940+
}
19341941
}
19351942

19361943
func (e *EvmNetworkConfig) SetDefaults() error {
@@ -1960,7 +1967,7 @@ func (e *EvmNetworkConfig) SetDefaults() error {
19601967

19611968
// Default methods for marking empty results as errors
19621969
if e.MarkEmptyAsErrorMethods == nil {
1963-
e.MarkEmptyAsErrorMethods = DefaultMarkEmptyAsErrorMethods
1970+
e.MarkEmptyAsErrorMethods = DefaultMarkEmptyAsErrorMethods()
19641971
}
19651972

19661973
return nil
@@ -2122,6 +2129,9 @@ func (r *RetryPolicyConfig) SetDefaults(defaults *RetryPolicyConfig) error {
21222129
r.EmptyResultAccept = defaults.EmptyResultIgnore
21232130
}
21242131
}
2132+
if r.EmptyResultAccept == nil {
2133+
r.EmptyResultAccept = DefaultEmptyResultAccept()
2134+
}
21252135

21262136
// Default EmptyResultMaxAttempts to MaxAttempts if not set
21272137
if r.EmptyResultMaxAttempts == 0 {

common/defaults_test.go

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,12 @@ func TestSetDefaults_NetworkConfig(t *testing.T) {
104104
assert.EqualValues(t, &FailsafeConfig{
105105
MatchMethod: "*",
106106
Retry: &RetryPolicyConfig{
107-
MaxAttempts: 12345,
108-
Delay: Duration(0 * time.Millisecond),
109-
BackoffMaxDelay: Duration(3 * time.Second),
110-
BackoffFactor: 1.2,
111-
Jitter: Duration(0 * time.Millisecond),
107+
MaxAttempts: 12345,
108+
Delay: Duration(0 * time.Millisecond),
109+
BackoffMaxDelay: Duration(3 * time.Second),
110+
BackoffFactor: 1.2,
111+
Jitter: Duration(0 * time.Millisecond),
112+
EmptyResultAccept: DefaultEmptyResultAccept(),
112113
},
113114
}, network.Failsafe[0])
114115
assert.Nil(t, network.Failsafe[0].Timeout)
@@ -359,11 +360,12 @@ func TestSetDefaults_UpstreamConfig(t *testing.T) {
359360
// Verify failsafe retry is only applied to the first upstream
360361
retry := cfg.Projects[0].Upstreams[0].Failsafe[0].Retry
361362
assert.EqualValues(t, &RetryPolicyConfig{
362-
MaxAttempts: 2,
363-
BackoffMaxDelay: Duration(10 * time.Second),
364-
Delay: Duration(1 * time.Second),
365-
Jitter: Duration(500 * time.Millisecond),
366-
BackoffFactor: 1.2,
363+
MaxAttempts: 2,
364+
BackoffMaxDelay: Duration(10 * time.Second),
365+
Delay: Duration(1 * time.Second),
366+
Jitter: Duration(500 * time.Millisecond),
367+
BackoffFactor: 1.2,
368+
EmptyResultAccept: DefaultEmptyResultAccept(),
367369
}, retry, "Retry policy should match expected values")
368370

369371
assert.Nil(t, cfg.Projects[0].Upstreams[0].Failsafe[0].CircuitBreaker, "Circuit breaker should be nil because this upstream has failsafe defined")

docs/hedge-cancel-on-error.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Hedge: "Don't cancel on first error" exploration
2+
3+
## Current behavior
4+
5+
- **CancelIf** in the hedge policy cancels other hedges when:
6+
- This execution returns **any** non-exhaustion error, or
7+
- This execution returns an **accepted** result (non-empty, or empty but in `emptyResultAccept` and not consensus).
8+
- We explicitly **do not** cancel on `ErrUpstreamsExhausted` / `ErrCodeNoUpstreamsLeftToSelect` so other hedges can still finish.
9+
10+
So: "first to return (result or error) wins" — we cancel as soon as one execution returns, unless that return is exhaustion.
11+
12+
## Why "don't cancel on first error" was not the default
13+
14+
1. **Latency when all upstreams fail**
15+
If we only cancelled on success, then when every hedge fails we would wait for the **slowest** execution instead of returning as soon as the first one fails. Example: Alchemy errors in 100ms, QuickNode in 3s → today we return in ~100ms; if we stopped cancelling on error we’d wait ~3s for the same outcome.
16+
17+
2. **Original mental model**
18+
With a single execution (no hedge), "first response" is the only response. Adding hedge kept the same idea: "first response (success or failure) wins" to avoid extra wait when the first response is already a failure.
19+
20+
3. **Loop-over-upstreams**
21+
Each execution runs a **loop** over upstreams (`maxLoopIterations` = 1 in consensus, `UpstreamsCount()` otherwise). With hedge, executions **share** `ConsumedUpstreams`, so:
22+
- Execution 1 often gets upstream A, execution 2 gets B.
23+
- Execution 1 can return after **one** upstream (e.g. error from A, then next `NextUpstream` hits duplicate or exhausted and the loop breaks).
24+
- So "first return" is often "first error from one upstream", not "tried everything". Letting that first error cancel the other hedge (e.g. QuickNode) is what causes the trace_filter case: Alchemy errors fast, we cancel QuickNode which would have succeeded in 2–3s.
25+
26+
So the loop doesn’t remove the need for "don’t cancel on first error"; it’s what makes the current "cancel on any return" strict — one execution can exit quickly with an error and kill the other.
27+
28+
## Side effects of "don’t cancel on first error"
29+
30+
| Scenario | Current (cancel on any return) | Don’t cancel on error |
31+
|--------|--------------------------------|------------------------|
32+
| First returns **success** | Other hedges cancelled ✓ | Same ✓ |
33+
| First returns **error**, another would succeed | Other hedges cancelled ✗ (e.g. QuickNode discarded) | Other can complete ✓ |
34+
| All return **errors** | Return after first error (low latency) ✓ | Wait for slowest (higher latency) ✗ |
35+
| First returns **client/execution** error (same everywhere) | Other hedges cancelled, we return quickly ✓ | We’d still wait for others for no benefit ✗ |
36+
37+
So a good refinement is: **cancel only on terminal errors**, not on every error.
38+
39+
- **Terminal errors** (cancel others): same on every upstream, no need to wait for more.
40+
- `IsClientError(err)` (bad request, range exceeded, etc.)
41+
- `ErrCodeEndpointExecutionException` (e.g. revert)
42+
- **Non-terminal errors** (don’t cancel): method not supported, 5xx, timeout, missing data, etc. — another hedge might succeed.
43+
44+
## Recommended refinement
45+
46+
In `CancelIf`, instead of:
47+
48+
```go
49+
if err != nil {
50+
return true // cancel on any error
51+
}
52+
```
53+
54+
use:
55+
56+
```go
57+
if err != nil {
58+
// Cancel only on terminal errors; let other hedges complete on transient/upstream-specific errors.
59+
if common.IsClientError(err) || common.HasErrorCode(err, common.ErrCodeEndpointExecutionException) {
60+
return true
61+
}
62+
return false
63+
}
64+
```
65+
66+
Effects:
67+
68+
- **trace_filter**: Alchemy returns "method not supported" → we don’t cancel → QuickNode can return success (or its own error) → we use the first success or aggregate failures.
69+
- **All fail**: We wait for all hedges to finish, then failsafe/retry sees the errors. Latency is max of hedge durations instead of min; acceptable if we prefer success when any single hedge can succeed.
70+
- **Client/revert**: We still cancel on first terminal error and return quickly.
71+
72+
## Summary
73+
74+
- We didn’t avoid "don’t cancel on first error" because of the loop; the loop is why one execution often returns quickly with one upstream’s error and then we cancel the rest.
75+
- Full "don’t cancel on error" would fix the trace_filter case but worsen latency when every hedge fails.
76+
- **Cancel only on terminal error** (client + execution exception) keeps latency good for deterministic failures and lets other hedges complete when the first failure is upstream-specific (e.g. method not supported).

erpc/networks.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,7 @@ func (n *Network) Forward(ctx context.Context, req *common.NormalizedRequest) (*
839839
}
840840

841841
isEmpty := resp == nil || resp.IsObjectNull(ctx) || resp.IsResultEmptyish(ctx)
842+
forwardSpan.SetAttributes(attribute.Bool("response.emptyish", isEmpty))
842843
if isEmpty {
843844
lg.Trace().Msgf("response is empty")
844845
}
@@ -868,6 +869,7 @@ func (n *Network) Forward(ctx context.Context, req *common.NormalizedRequest) (*
868869
if upstream != nil {
869870
if mt := upstream.MetricsTracker(); mt != nil {
870871
mt.RecordUpstreamFailure(upstream, method, upstreamErr)
872+
mt.RecordUpstreamMisbehavior(upstream, method)
871873
}
872874
}
873875
return true

erpc/networks_empty_result_shortcircuit_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ func TestEmptyResultAcceptShortCircuit(t *testing.T) {
6565
MaxAttempts: 3,
6666
Delay: common.Duration(50 * time.Millisecond),
6767
EmptyResultDelay: common.Duration(200 * time.Millisecond),
68-
EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
68+
EmptyResultAccept: common.DefaultEmptyResultAccept(),
6969
},
7070
)
7171

@@ -134,7 +134,7 @@ func TestEmptyResultAcceptShortCircuit(t *testing.T) {
134134
MaxAttempts: 3,
135135
Delay: common.Duration(50 * time.Millisecond),
136136
EmptyResultDelay: common.Duration(200 * time.Millisecond),
137-
EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
137+
EmptyResultAccept: common.DefaultEmptyResultAccept(),
138138
},
139139
)
140140

@@ -206,7 +206,7 @@ func TestEmptyResultAcceptShortCircuit(t *testing.T) {
206206
MaxAttempts: 2,
207207
Delay: common.Duration(10 * time.Millisecond),
208208
EmptyResultDelay: common.Duration(10 * time.Millisecond),
209-
EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
209+
EmptyResultAccept: common.DefaultEmptyResultAccept(),
210210
},
211211
)
212212

@@ -269,7 +269,7 @@ func TestEmptyResultAcceptShortCircuit(t *testing.T) {
269269
MaxAttempts: 3,
270270
Delay: common.Duration(50 * time.Millisecond),
271271
EmptyResultDelay: common.Duration(200 * time.Millisecond),
272-
EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
272+
EmptyResultAccept: common.DefaultEmptyResultAccept(),
273273
},
274274
)
275275

erpc/networks_failsafe_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ func TestNetworkFailsafe_RetryEmpty(t *testing.T) {
225225
},
226226
&common.RetryPolicyConfig{
227227
MaxAttempts: 3,
228-
EmptyResultAccept: []string{"eth_getLogs", "eth_call"}, // eth_getTransactionByHash not in list
228+
EmptyResultAccept: common.DefaultEmptyResultAccept(), // eth_getTransactionByHash not in list
229229
},
230230
)
231231

erpc/networks_registry.go

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,6 @@ import (
1818
"github.com/rs/zerolog"
1919
)
2020

21-
// defaultEmptyResultAccept is the set of methods for which the first emptyish
22-
// result short-circuits the upstream loop (and tells failsafe not to retry).
23-
// Must stay in sync with the default in upstream/failsafe.go.
24-
var defaultEmptyResultAccept = []string{"eth_getLogs", "eth_call"}
25-
2621
type NetworksRegistry struct {
2722
project *PreparedProject
2823
appCtx context.Context
@@ -114,7 +109,7 @@ func NewNetwork(
114109
method = "*"
115110
}
116111

117-
emptyAccept := defaultEmptyResultAccept
112+
emptyAccept := common.DefaultEmptyResultAccept()
118113
if fsCfg.Retry != nil && fsCfg.Retry.EmptyResultAccept != nil {
119114
emptyAccept = fsCfg.Retry.EmptyResultAccept
120115
}
@@ -137,7 +132,7 @@ func NewNetwork(
137132
executor: failsafe.NewExecutor[*common.NormalizedResponse](),
138133
timeout: nil,
139134
consensusPolicyEnabled: false,
140-
emptyResultAccept: defaultEmptyResultAccept,
135+
emptyResultAccept: common.DefaultEmptyResultAccept(),
141136
})
142137

143138
lg.Debug().Interface("config", nwCfg.Failsafe).Msgf("created %d failsafe executors", len(failsafeExecutors))

erpc/networks_retry_missing_data_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1834,7 +1834,7 @@ func TestNetworkForward_EmptyResultAccept_StopsRetryForAcceptedMethod(t *testing
18341834
MaxAttempts: 5,
18351835
Delay: common.Duration(50 * time.Millisecond),
18361836
EmptyResultDelay: common.Duration(100 * time.Millisecond),
1837-
EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
1837+
EmptyResultAccept: common.DefaultEmptyResultAccept(),
18381838
},
18391839
)
18401840

0 commit comments

Comments
 (0)