rhinestonewtf
diff --git a/‎architecture/evm/hooks_test.go‎
Lines changed: 3 additions & 3 deletions b/‎architecture/evm/hooks_test.go‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎common/defaults.go‎
Lines changed: 34 additions & 24 deletions b/‎common/defaults.go‎
Lines changed: 34 additions & 24 deletions
diff --git a/‎common/defaults_test.go‎
Lines changed: 12 additions & 10 deletions b/‎common/defaults_test.go‎
Lines changed: 12 additions & 10 deletions
diff --git a/‎docs/hedge-cancel-on-error.md‎
Lines changed: 76 additions & 0 deletions b/‎docs/hedge-cancel-on-error.md‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎erpc/networks.go‎
Lines changed: 2 additions & 0 deletions b/‎erpc/networks.go‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎erpc/networks_empty_result_shortcircuit_test.go‎
Lines changed: 4 additions & 4 deletions b/‎erpc/networks_empty_result_shortcircuit_test.go‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎erpc/networks_failsafe_test.go‎
Lines changed: 1 addition & 1 deletion b/‎erpc/networks_failsafe_test.go‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎erpc/networks_registry.go‎
Lines changed: 2 additions & 7 deletions b/‎erpc/networks_registry.go‎
Lines changed: 2 additions & 7 deletions
diff --git a/‎erpc/networks_retry_missing_data_test.go‎
Lines changed: 1 addition & 1 deletion b/‎erpc/networks_retry_missing_data_test.go‎
Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_ListedMethods(t *testing.T) {
 	}
 
 	// Create a test network with the default methods configured
-	network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)
+	network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())
 
 	for _, m := range methods {
 		// Build a minimal request with method m
@@ -82,7 +82,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_RetryEmptyFalse(t *testing.T) {
 	}
 
 	// Create a test network with the default methods configured
-	network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)
+	network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())
 
 	for _, m := range methods {
 		// Build a minimal request with method m
@@ -125,7 +125,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_NonListedMethods(t *testing.T) {
 	}
 
 	// Create a test network with the default methods configured
-	network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)
+	network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())
 
 	for _, m := range methods {
 		// Build a minimal request with method m
 
@@ -1908,29 +1908,36 @@ func (n *NetworkConfig) SetDefaults(upstreams []*UpstreamConfig, defaults *Netwo
 const DefaultEvmFinalityDepth = 1024
 const DefaultEvmStatePollerDebounce = Duration(5 * time.Second)
 
-// DefaultMarkEmptyAsErrorMethods lists the default methods for which empty/null results
-// should be treated as "missing data" errors, triggering retry on other upstreams.
-// Note: eth_getBlockByHash is intentionally excluded because subgraph-based upstreams
-// commonly return empty for this method, which is expected behavior.
-// Note: eth_getTransactionReceipt is excluded as a quick remedy. Ideally we'd only allow null
-// for pending txs (historical txs should retry on other upstreams). The "retry on empty" directive
-// can still be used since some nodes may already have the receipt.
-var DefaultMarkEmptyAsErrorMethods = []string{
-	// Block lookups (eth_getBlockByHash excluded - subgraphs return empty for it)
-	"eth_getBlockByNumber",
-	"eth_getBlockReceipts",
-	// Transaction lookups (eth_getTransactionReceipt excluded - see note above)
-	"eth_getTransactionByHash",
-	"eth_getTransactionByBlockHashAndIndex",
-	"eth_getTransactionByBlockNumberAndIndex",
-	// Uncle/ommers (legacy API)
-	"eth_getUncleByBlockHashAndIndex",
-	"eth_getUncleByBlockNumberAndIndex",
-	// Traces (debug/trace/parity modules)
-	"debug_traceTransaction",
-	"trace_transaction",
-	"trace_block",
-	"trace_get",
+// DefaultEmptyResultAccept returns a fresh copy of the methods for which an
+// empty/null result is considered valid (e.g. eth_getLogs, eth_call). A new
+// slice is returned on every call so callers cannot mutate the shared default.
+func DefaultEmptyResultAccept() []string {
+	return []string{"eth_getLogs", "eth_call"}
+}
+
+// DefaultMarkEmptyAsErrorMethods returns a fresh copy of the methods for which
+// empty/null results should be treated as "missing data" errors, triggering retry
+// on other upstreams. A new slice is returned on every call so callers cannot
+// mutate the shared default.
+//
+// Note: eth_getBlockByHash is intentionally excluded because subgraph-based
+// upstreams commonly return empty for this method, which is expected behavior.
+// Note: eth_getTransactionReceipt is excluded as a quick remedy. Ideally we'd
+// only allow null for pending txs.
+func DefaultMarkEmptyAsErrorMethods() []string {
+	return []string{
+		"eth_getBlockByNumber",
+		"eth_getBlockReceipts",
+		"eth_getTransactionByHash",
+		"eth_getTransactionByBlockHashAndIndex",
+		"eth_getTransactionByBlockNumberAndIndex",
+		"eth_getUncleByBlockHashAndIndex",
+		"eth_getUncleByBlockNumberAndIndex",
+		"debug_traceTransaction",
+		"trace_transaction",
+		"trace_block",
+		"trace_get",
+	}
 }
 
 func (e *EvmNetworkConfig) SetDefaults() error {
@@ -1960,7 +1967,7 @@ func (e *EvmNetworkConfig) SetDefaults() error {
 
 	// Default methods for marking empty results as errors
 	if e.MarkEmptyAsErrorMethods == nil {
-		e.MarkEmptyAsErrorMethods = DefaultMarkEmptyAsErrorMethods
+		e.MarkEmptyAsErrorMethods = DefaultMarkEmptyAsErrorMethods()
 	}
 
 	return nil
@@ -2122,6 +2129,9 @@ func (r *RetryPolicyConfig) SetDefaults(defaults *RetryPolicyConfig) error {
 			r.EmptyResultAccept = defaults.EmptyResultIgnore
 		}
 	}
+	if r.EmptyResultAccept == nil {
+		r.EmptyResultAccept = DefaultEmptyResultAccept()
+	}
 
 	// Default EmptyResultMaxAttempts to MaxAttempts if not set
 	if r.EmptyResultMaxAttempts == 0 {
 
@@ -104,11 +104,12 @@ func TestSetDefaults_NetworkConfig(t *testing.T) {
 		assert.EqualValues(t, &FailsafeConfig{
 			MatchMethod: "*",
 			Retry: &RetryPolicyConfig{
-				MaxAttempts:     12345,
-				Delay:           Duration(0 * time.Millisecond),
-				BackoffMaxDelay: Duration(3 * time.Second),
-				BackoffFactor:   1.2,
-				Jitter:          Duration(0 * time.Millisecond),
+				MaxAttempts:       12345,
+				Delay:             Duration(0 * time.Millisecond),
+				BackoffMaxDelay:   Duration(3 * time.Second),
+				BackoffFactor:     1.2,
+				Jitter:            Duration(0 * time.Millisecond),
+				EmptyResultAccept: DefaultEmptyResultAccept(),
 			},
 		}, network.Failsafe[0])
 		assert.Nil(t, network.Failsafe[0].Timeout)
@@ -359,11 +360,12 @@ func TestSetDefaults_UpstreamConfig(t *testing.T) {
 		// Verify failsafe retry is only applied to the first upstream
 		retry := cfg.Projects[0].Upstreams[0].Failsafe[0].Retry
 		assert.EqualValues(t, &RetryPolicyConfig{
-			MaxAttempts:     2,
-			BackoffMaxDelay: Duration(10 * time.Second),
-			Delay:           Duration(1 * time.Second),
-			Jitter:          Duration(500 * time.Millisecond),
-			BackoffFactor:   1.2,
+			MaxAttempts:       2,
+			BackoffMaxDelay:   Duration(10 * time.Second),
+			Delay:             Duration(1 * time.Second),
+			Jitter:            Duration(500 * time.Millisecond),
+			BackoffFactor:     1.2,
+			EmptyResultAccept: DefaultEmptyResultAccept(),
 		}, retry, "Retry policy should match expected values")
 
 		assert.Nil(t, cfg.Projects[0].Upstreams[0].Failsafe[0].CircuitBreaker, "Circuit breaker should be nil because this upstream has failsafe defined")
 
@@ -0,0 +1,76 @@
+# Hedge: "Don't cancel on first error" exploration
+
+## Current behavior
+
+- **CancelIf** in the hedge policy cancels other hedges when:
+  - This execution returns **any** non-exhaustion error, or
+  - This execution returns an **accepted** result (non-empty, or empty but in `emptyResultAccept` and not consensus).
+- We explicitly **do not** cancel on `ErrUpstreamsExhausted` / `ErrCodeNoUpstreamsLeftToSelect` so other hedges can still finish.
+
+So: "first to return (accepted result or error) wins" — we cancel the other hedges as soon as one execution returns an accepted result or an error, unless that error is exhaustion.
+
+## Why "don't cancel on first error" was not the default
+
+1. **Latency when all upstreams fail**  
+   If we only cancelled on success, then when every hedge fails we would wait for the **slowest** execution instead of returning as soon as the first one fails. Example: Alchemy errors in 100ms, QuickNode in 3s → today we return in ~100ms; if we stopped cancelling on error we’d wait ~3s for the same outcome.
+
+2. **Original mental model**  
+   With a single execution (no hedge), "first response" is the only response. Adding hedge kept the same idea: "first response (success or failure) wins" to avoid extra wait when the first response is already a failure.
+
+3. **Loop-over-upstreams**  
+   Each execution runs a **loop** over upstreams (`maxLoopIterations` = 1 in consensus, `UpstreamsCount()` otherwise). With hedge, executions **share** `ConsumedUpstreams`, so:
+   - Execution 1 often gets upstream A, execution 2 gets B.
+   - Execution 1 can return after **one** upstream (e.g. error from A, then next `NextUpstream` hits duplicate or exhausted and the loop breaks).
+   - So "first return" is often "first error from one upstream", not "tried everything". Letting that first error cancel the other hedge (e.g. QuickNode) is what causes the trace_filter case: Alchemy errors fast, we cancel QuickNode which would have succeeded in 2–3s.
+
+So the loop doesn’t remove the need for "don’t cancel on first error"; it’s what makes the current "cancel on any return" rule too strict — one execution can exit quickly with a single upstream’s error and cancel the other hedge.
+
+## Side effects of "don’t cancel on first error"
+
+| Scenario | Current (cancel on any return) | Don’t cancel on error |
+|--------|--------------------------------|------------------------|
+| First returns **success** | Other hedges cancelled ✓ | Same ✓ |
+| First returns **error**, another would succeed | Other hedges cancelled ✗ (e.g. QuickNode discarded) | Other can complete ✓ |
+| All return **errors** | Return after first error (low latency) ✓ | Wait for slowest (higher latency) ✗ |
+| First returns **client/execution** error (same everywhere) | Other hedges cancelled, we return quickly ✓ | We’d still wait for others for no benefit ✗ |
+
+So a good refinement is: **cancel only on terminal errors**, not on every error.
+
+- **Terminal errors** (cancel others): same on every upstream, no need to wait for more.
+  - `IsClientError(err)` (bad request, range exceeded, etc.)
+  - `ErrCodeEndpointExecutionException` (e.g. revert)
+- **Non-terminal errors** (don’t cancel): method not supported, 5xx, timeout, missing data, etc. — another hedge might succeed.
+
+## Recommended refinement
+
+In `CancelIf`, instead of:
+
+```go
+if err != nil {
+    return true  // cancel on any error
+}
+```
+
+use:
+
+```go
+if err != nil {
+    // Cancel only on terminal errors; let other hedges complete on transient/upstream-specific errors.
+    if common.IsClientError(err) || common.HasErrorCode(err, common.ErrCodeEndpointExecutionException) {
+        return true
+    }
+    return false
+}
+```
+
+Effects:
+
+- **trace_filter**: Alchemy returns "method not supported" → we don’t cancel → QuickNode can return success (or its own error) → we use the first success or aggregate failures.
+- **All fail with non-terminal errors**: We wait for all hedges to finish, then failsafe/retry sees the errors. Latency is the max of the hedge durations instead of the min; this is acceptable if we prefer success whenever any single hedge can succeed.
+- **Client/revert**: We still cancel on first terminal error and return quickly.
+
+## Summary
+
+- The loop over upstreams is not the reason we avoided "don’t cancel on first error"; rather, the loop is why one execution often returns quickly with one upstream’s error, after which we cancel the rest.
+- Full "don’t cancel on error" would fix the trace_filter case but worsen latency when every hedge fails.
+- **Cancel only on terminal error** (client + execution exception) keeps latency good for deterministic failures and lets other hedges complete when the first failure is upstream-specific (e.g. method not supported).
@@ -839,6 +839,7 @@ func (n *Network) Forward(ctx context.Context, req *common.NormalizedRequest) (*
 	}
 
 	isEmpty := resp == nil || resp.IsObjectNull(ctx) || resp.IsResultEmptyish(ctx)
+	forwardSpan.SetAttributes(attribute.Bool("response.emptyish", isEmpty))
 	if isEmpty {
 		lg.Trace().Msgf("response is empty")
 	}
@@ -868,6 +869,7 @@ func (n *Network) Forward(ctx context.Context, req *common.NormalizedRequest) (*
 			if upstream != nil {
 				if mt := upstream.MetricsTracker(); mt != nil {
 					mt.RecordUpstreamFailure(upstream, method, upstreamErr)
+					mt.RecordUpstreamMisbehavior(upstream, method)
 				}
 			}
 			return true
 
@@ -65,7 +65,7 @@ func TestEmptyResultAcceptShortCircuit(t *testing.T) {
 				MaxAttempts:       3,
 				Delay:             common.Duration(50 * time.Millisecond),
 				EmptyResultDelay:  common.Duration(200 * time.Millisecond),
-				EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
+				EmptyResultAccept: common.DefaultEmptyResultAccept(),
 			},
 		)
 
@@ -134,7 +134,7 @@ func TestEmptyResultAcceptShortCircuit(t *testing.T) {
 				MaxAttempts:       3,
 				Delay:             common.Duration(50 * time.Millisecond),
 				EmptyResultDelay:  common.Duration(200 * time.Millisecond),
-				EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
+				EmptyResultAccept: common.DefaultEmptyResultAccept(),
 			},
 		)
 
@@ -206,7 +206,7 @@ func TestEmptyResultAcceptShortCircuit(t *testing.T) {
 				MaxAttempts:       2,
 				Delay:             common.Duration(10 * time.Millisecond),
 				EmptyResultDelay:  common.Duration(10 * time.Millisecond),
-				EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
+				EmptyResultAccept: common.DefaultEmptyResultAccept(),
 			},
 		)
 
@@ -269,7 +269,7 @@ func TestEmptyResultAcceptShortCircuit(t *testing.T) {
 				MaxAttempts:       3,
 				Delay:             common.Duration(50 * time.Millisecond),
 				EmptyResultDelay:  common.Duration(200 * time.Millisecond),
-				EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
+				EmptyResultAccept: common.DefaultEmptyResultAccept(),
 			},
 		)
 
 
@@ -225,7 +225,7 @@ func TestNetworkFailsafe_RetryEmpty(t *testing.T) {
 			},
 			&common.RetryPolicyConfig{
 				MaxAttempts:       3,
-				EmptyResultAccept: []string{"eth_getLogs", "eth_call"}, // eth_getTransactionByHash not in list
+				EmptyResultAccept: common.DefaultEmptyResultAccept(), // eth_getTransactionByHash not in list
 			},
 		)
 
 
@@ -18,11 +18,6 @@ import (
 	"github.com/rs/zerolog"
 )
 
-// defaultEmptyResultAccept is the set of methods for which the first emptyish
-// result short-circuits the upstream loop (and tells failsafe not to retry).
-// Must stay in sync with the default in upstream/failsafe.go.
-var defaultEmptyResultAccept = []string{"eth_getLogs", "eth_call"}
-
 type NetworksRegistry struct {
 	project              *PreparedProject
 	appCtx               context.Context
@@ -114,7 +109,7 @@ func NewNetwork(
 				method = "*"
 			}
 
-			emptyAccept := defaultEmptyResultAccept
+			emptyAccept := common.DefaultEmptyResultAccept()
 			if fsCfg.Retry != nil && fsCfg.Retry.EmptyResultAccept != nil {
 				emptyAccept = fsCfg.Retry.EmptyResultAccept
 			}
@@ -137,7 +132,7 @@ func NewNetwork(
 		executor:               failsafe.NewExecutor[*common.NormalizedResponse](),
 		timeout:                nil,
 		consensusPolicyEnabled: false,
-		emptyResultAccept:      defaultEmptyResultAccept,
+		emptyResultAccept:      common.DefaultEmptyResultAccept(),
 	})
 
 	lg.Debug().Interface("config", nwCfg.Failsafe).Msgf("created %d failsafe executors", len(failsafeExecutors))
 
@@ -1834,7 +1834,7 @@ func TestNetworkForward_EmptyResultAccept_StopsRetryForAcceptedMethod(t *testing
 				MaxAttempts:       5,
 				Delay:             common.Duration(50 * time.Millisecond),
 				EmptyResultDelay:  common.Duration(100 * time.Millisecond),
-				EmptyResultAccept: []string{"eth_getLogs", "eth_call"},
+				EmptyResultAccept: common.DefaultEmptyResultAccept(),
 			},
 		)
Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_ListedMethods(t *testing.T) {`
`41`	`41`	`}`
`42`	`42`
`43`	`43`	`// Create a test network with the default methods configured`
`44`		`- network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)`
	`44`	`+ network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())`
`45`	`45`
`46`	`46`	`for _, m := range methods {`
`47`	`47`	`// Build a minimal request with method m`
`@@ -82,7 +82,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_RetryEmptyFalse(t *testing.T) {`
`82`	`82`	`}`
`83`	`83`
`84`	`84`	`// Create a test network with the default methods configured`
`85`		`- network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)`
	`85`	`+ network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())`
`86`	`86`
`87`	`87`	`for _, m := range methods {`
`88`	`88`	`// Build a minimal request with method m`
`@@ -125,7 +125,7 @@ func TestUpstreamPostForward_UnexpectedEmpty_NonListedMethods(t *testing.T) {`
`125`	`125`	`}`
`126`	`126`
`127`	`127`	`// Create a test network with the default methods configured`
`128`		`- network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods)`
	`128`	`+ network := newTestNetworkWithMarkEmptyMethods(common.DefaultMarkEmptyAsErrorMethods())`
`129`	`129`
`130`	`130`	`for _, m := range methods {`
`131`	`131`	`// Build a minimal request with method m`
Original file line number	Diff line number	Diff line change
`@@ -839,6 +839,7 @@ func (n Network) Forward(ctx context.Context, req common.NormalizedRequest) (*`
`839`	`839`	`}`
`840`	`840`
`841`	`841`	`isEmpty := resp == nil \|\| resp.IsObjectNull(ctx) \|\| resp.IsResultEmptyish(ctx)`
	`842`	`+ forwardSpan.SetAttributes(attribute.Bool("response.emptyish", isEmpty))`
`842`	`843`	`if isEmpty {`
`843`	`844`	`lg.Trace().Msgf("response is empty")`
`844`	`845`	`}`
`@@ -868,6 +869,7 @@ func (n Network) Forward(ctx context.Context, req common.NormalizedRequest) (*`
`868`	`869`	`if upstream != nil {`
`869`	`870`	`if mt := upstream.MetricsTracker(); mt != nil {`
`870`	`871`	`mt.RecordUpstreamFailure(upstream, method, upstreamErr)`
	`872`	`+ mt.RecordUpstreamMisbehavior(upstream, method)`
`871`	`873`	`}`
`872`	`874`	`}`
`873`	`875`	`return true`
Original file line number	Diff line number	Diff line change
`@@ -225,7 +225,7 @@ func TestNetworkFailsafe_RetryEmpty(t *testing.T) {`
`225`	`225`	`},`
`226`	`226`	`&common.RetryPolicyConfig{`
`227`	`227`	`MaxAttempts: 3,`
`228`		`- EmptyResultAccept: []string{"eth_getLogs", "eth_call"}, // eth_getTransactionByHash not in list`
	`228`	`+ EmptyResultAccept: common.DefaultEmptyResultAccept(), // eth_getTransactionByHash not in list`
`229`	`229`	`},`
`230`	`230`	`)`
`231`	`231`
Original file line number	Diff line number	Diff line change
`@@ -1834,7 +1834,7 @@ func TestNetworkForward_EmptyResultAccept_StopsRetryForAcceptedMethod(t *testing`
`1834`	`1834`	`MaxAttempts: 5,`
`1835`	`1835`	`Delay: common.Duration(50 * time.Millisecond),`
`1836`	`1836`	`EmptyResultDelay: common.Duration(100 * time.Millisecond),`
`1837`		`- EmptyResultAccept: []string{"eth_getLogs", "eth_call"},`
	`1837`	`+ EmptyResultAccept: common.DefaultEmptyResultAccept(),`
`1838`	`1838`	`},`
`1839`	`1839`	`)`
`1840`	`1840`