ratelimits: stricter() should always prefer denied decisions (#8674)

beautifulentropy · web-flow · commit e7eb105266cd · 2026-03-17T14:03:36.000-07:00
In ratelimits.BatchSpend(), stricter() selects the most restrictive
decision across all rate limits in a batch. It compares decisions solely
by retryIn, assuming a longer wait is always stricter, and never checks
the allowed field. An allowed decision with a high retryIn beats a
denied decision with a low retryIn, causing the batch to return allowed:
true despite one or more limits denying the request.

This can occur at NewOrder time via checkNewOrderLimits(), which
batches:

- NewOrdersPerAccount (check-and-spend, emissionInterval=36s)
- FailedAuthorizationsPerDomainPerAccount (check-only,
emissionInterval=12min)
- CertificatesPerDomain (check-only, emissionInterval=3.36h)
- CertificatesPerFQDNSet (check-only, emissionInterval=33.6h)

A subscriber who exhausts their 300 new-order quota gets a denied
decision with a retryIn of at most 36s. If any check-only limit in the
same batch has exactly 1 token remaining at request time, that last
token produces an allowed decision with retryIn equal to that limit's
full emission interval (12min, 3.36h, or 33.6h), each of which dwarfs
36s. Because check-only transactions never deduct from the bucket, the
token is never consumed and the same oversized retryIn wins on every
subsequent request. Orders can pile up well past the 300 limit until
countCertificateIssued() runs a separate spend-only batch after
issuance.

This can also occur for IPv6 clients at NewAccount time via
checkNewAccountLimits(), which batches:

- NewRegistrationsPerIPAddress (check-and-spend,
emissionInterval=18min)
- NewRegistrationsPerIPv6Range (check-and-spend,
emissionInterval=21.6s)

Both are check-and-spend with the same 3-hour period, but their emission
intervals differ (18min vs 21.6s), so the bug is technically possible
here too. For IPv6, the per-IP limit (10) will almost always exhaust
before the /48 range limit (500), i.e. the usual case is per-IP denied
and per-range allowed. The case that triggers the bug (per-range denied,
per-IP allowed) is unlikely but not impossible if many IPs in the same
/48 are registering. If NewRegistrationsPerIPv6Range denies (retryIn of
at most 21.6s) while NewRegistrationsPerIPAddress allows with 1
remaining (retryIn = 18min), stricter() picks the allowed decision. Both
transactions are check-and-spend, so the request that sneaks through
deducts from the per-IP bucket, exhausting it. After that, both limits
deny and the bug no longer triggers. This is unlike NewOrder, where the
check-only token is never consumed and the bug keeps recurring.

Restructure stricter() so denied decisions are always picked over
allowed ones regardless of retryIn. The retryIn and remaining
comparisons now serve only as tiebreakers within the same allowed/denied
category.
diff --git a/ratelimits/limiter.go b/ratelimits/limiter.go
@@ -238,17 +238,27 @@ func prepareBatch(txns []Transaction) ([]Transaction, []string, error) {
 	return transactions, bucketKeys, nil
 }
 
-func stricter(existing *Decision, incoming *Decision) *Decision {
-	if existing.retryIn == incoming.retryIn {
-		if existing.remaining < incoming.remaining {
-			return existing
+func stricter(a, b *Decision) *Decision {
+	switch {
+	case a.allowed != b.allowed:
+		// Denied is always stricter than allowed.
+		if !a.allowed {
+			return a
 		}
-		return incoming
-	}
-	if existing.retryIn > incoming.retryIn {
-		return existing
+		return b
+	case a.retryIn != b.retryIn:
+		// Longer wait is stricter.
+		if a.retryIn > b.retryIn {
+			return a
+		}
+		return b
+	default:
+		// Fewer remaining is stricter.
+		if a.remaining < b.remaining {
+			return a
+		}
+		return b
 	}
-	return incoming
 }
 
 // BatchSpend attempts to deduct the costs from the provided buckets'
diff --git a/ratelimits/limiter_test.go b/ratelimits/limiter_test.go
@@ -610,3 +610,63 @@ func TestRateLimitError(t *testing.T) {
 		})
 	}
 }
+
+func TestStricterDeniedBeatsAllowed(t *testing.T) {
+	t.Parallel()
+
+	clk := clock.NewFake()
+	l := newInmemTestLimiter(t, clk)
+	ctx := context.Background()
+
+	// Limit A, our fast limit, permits 2 requests per second.
+	limitA := &Limit{
+		Burst:  2,
+		Count:  2,
+		Period: config.Duration{Duration: time.Second},
+		Name:   NewRegistrationsPerIPAddress,
+	}
+	limitA.precompute()
+
+	// Limit B, our slow limit, permits 2 requests per hour. An allowed decision
+	// from this limit will have a retryIn up to 30 minutes, far exceeding any
+	// retryIn from the denied limit A.
+	limitB := &Limit{
+		Burst:  2,
+		Count:  2,
+		Period: config.Duration{Duration: time.Hour},
+		Name:   NewRegistrationsPerIPv6Range,
+	}
+	limitB.precompute()
+
+	bucketKeyA := "limitA:testkey"
+	bucketKeyB := "limitB:testkey"
+
+	// Exhaust limit A's bucket completely.
+	txnA2, err := newTransaction(limitA, bucketKeyA, 2)
+	test.AssertNotError(t, err, "Txn should be valid")
+	d, err := l.Spend(ctx, txnA2)
+	test.AssertNotError(t, err, "Should not error")
+	test.Assert(t, d.allowed, "Initial spend should be allowed")
+	test.AssertEquals(t, d.remaining, int64(0))
+
+	// Spend 1 from limit B so it's reduced to 1 remaining.
+	txnB1, err := newTransaction(limitB, bucketKeyB, 1)
+	test.AssertNotError(t, err, "Txn should be valid")
+	d, err = l.Spend(ctx, txnB1)
+	test.AssertNotError(t, err, "Should not error")
+	test.Assert(t, d.allowed, "Initial spend should be allowed")
+	test.AssertEquals(t, d.remaining, int64(1))
+
+	// Now batch, limit A should deny (0 remaining), limit B should allow (1
+	// remaining) but with a large retryIn (~30 minutes).
+	txnA1, err := newTransaction(limitA, bucketKeyA, 1)
+	test.AssertNotError(t, err, "Txn should be valid")
+	txnB1, err = newTransaction(limitB, bucketKeyB, 1)
+	test.AssertNotError(t, err, "Txn should be valid")
+
+	d, err = l.BatchSpend(ctx, []Transaction{txnA1, txnB1})
+	test.AssertNotError(t, err, "Should not error")
+
+	// The batch MUST be denied because limit A denied the request.
+	test.Assert(t, !d.allowed, "Batch should be denied when any limit denies")
+}