Skip to content

Commit b76056d

Browse files
liyishuaiclaude
andcommitted
lease: add integration test for lease renew/revoke race (Issue 14758)
Add TestV3LeaseKeysDeletedBeforeExpiry to test the invariant: if KeepAlive returns TTL > 0, the attached keys must still exist within the lease validity window. The test uses two failpoints (beforeCheckpointInLeaseRenew and afterLeaseRevokeDeleteKeys) to widen the race window between Renew and Revoke. It lets the lease expire while a KeepAlive is delayed, then checks whether the invariant holds. Also adds the afterLeaseRevokeDeleteKeys gofail failpoint to lessor.Revoke(), placed between key deletion and leaseMap removal. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Yishuai Li <yishuai.li@pingcap.com>
1 parent b5267c2 commit b76056d

File tree

2 files changed

+136
-0
lines changed

2 files changed

+136
-0
lines changed

server/lease/lessor.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,8 @@ func (le *lessor) Revoke(id LeaseID) error {
350350
txn.DeleteRange([]byte(key), nil)
351351
}
352352

353+
// gofail: var afterLeaseRevokeDeleteKeys struct{}
354+
353355
le.mu.Lock()
354356
defer le.mu.Unlock()
355357
delete(le.leaseMap, l.ID)

tests/integration/v3_lease_test.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1168,6 +1168,140 @@ func testV3LeaseTimeToLiveWithLeaderChanged(t *testing.T, fpName string) {
11681168
require.NoError(t, <-errCh)
11691169
}
11701170

1171+
// TestV3LeaseKeysDeletedBeforeExpiry tests the invariant that if a KeepAlive
1172+
// returns TTL > 0, the attached keys must still exist.
1173+
//
1174+
// It reproduces the race condition described in
1175+
// https://github.com/etcd-io/etcd/issues/14758 by letting the lease expire
1176+
// while a KeepAlive is in-flight and delayed by a failpoint:
1177+
//
1178+
// 1. Client sends KeepAlive ~2s before lease expires.
1179+
// 2. LeaseRenew → waitAppliedIndex (succeeds, nothing pending) → Renew()
1180+
// passes l.expired() check → hits beforeCheckpointInLeaseRenew → sleeps.
1181+
// 3. Lease expires. revokeExpiredLeases() detects it, proposes LeaseRevoke
1182+
// through Raft.
1183+
// 4. Raft commits. Apply goroutine calls Revoke() → deletes keys →
1184+
// hits afterLeaseRevokeDeleteKeys → pauses before removing from leaseMap.
1185+
// 5. Renew wakes up, acquires Lock, finds lease in leaseMap.
1186+
// 6. Client receives KeepAlive response.
1187+
// 7. Revoke resumes, removes lease from leaseMap, apply unblocks.
1188+
//
1189+
// The invariant: if the KeepAlive response has TTL > 0, the keys must still
1190+
// exist. If TTL ≤ 0, the keys may or may not exist.
1191+
func TestV3LeaseKeysDeletedBeforeExpiry(t *testing.T) {
1192+
integration.SkipIfNoGoFail(t)
1193+
integration.BeforeTest(t)
1194+
1195+
clus := integration.NewCluster(t, &integration.ClusterConfig{Size: 1})
1196+
defer clus.Terminate(t)
1197+
1198+
ctx, cancel := context.WithTimeout(t.Context(), 60*time.Second)
1199+
defer cancel()
1200+
1201+
client := clus.RandClient()
1202+
lc := integration.ToGRPC(client).Lease
1203+
kvc := integration.ToGRPC(client).KV
1204+
1205+
// Grant a lease with 10s TTL to give the race enough time to develop.
1206+
lresp, err := lc.LeaseGrant(ctx, &pb.LeaseGrantRequest{TTL: 10})
1207+
require.NoError(t, err)
1208+
leaseID := lresp.ID
1209+
t.Logf("Lease %x granted with TTL=%d", leaseID, lresp.TTL)
1210+
1211+
// Attach keys to the lease.
1212+
numKeys := 3
1213+
for i := range numKeys {
1214+
_, err := kvc.Put(ctx, &pb.PutRequest{
1215+
Key: []byte(fmt.Sprintf("lease-key-%d", i)),
1216+
Value: []byte(fmt.Sprintf("val-%d", i)),
1217+
Lease: leaseID,
1218+
})
1219+
require.NoError(t, err)
1220+
}
1221+
1222+
// Verify keys are attached.
1223+
ttlResp, err := lc.LeaseTimeToLive(ctx, &pb.LeaseTimeToLiveRequest{ID: leaseID, Keys: true})
1224+
require.NoError(t, err)
1225+
require.Len(t, ttlResp.Keys, numKeys)
1226+
1227+
// Open a KeepAlive stream early (before enabling failpoints).
1228+
lac, err := lc.LeaseKeepAlive(ctx)
1229+
require.NoError(t, err)
1230+
1231+
// Enable failpoints to widen the race window:
1232+
//
1233+
// beforeCheckpointInLeaseRenew: delays Renew() by 5s AFTER it passes
1234+
// l.expired() but BEFORE it acquires Lock to refresh.
1235+
//
1236+
// afterLeaseRevokeDeleteKeys: delays Revoke() by 5s AFTER deleting
1237+
// keys but BEFORE removing lease from leaseMap.
1238+
//
1239+
// Timeline:
1240+
// t=0s: Lease granted (TTL=10s)
1241+
// t=8s: KeepAlive sent → Renew passes expired() → sleeps 5s
1242+
// t=10s: Lease expires
1243+
// t=11-12s: revokeExpiredLeases → Raft → Revoke → deletes keys → pauses 5s
1244+
// t=13s: Renew wakes → finds lease in leaseMap → returns response
1245+
// t=16-17s: Revoke resumes → lease deleted → apply unblocks
1246+
require.NoError(t, gofail.Enable("beforeCheckpointInLeaseRenew", `sleep("5s")`))
1247+
require.NoError(t, gofail.Enable("afterLeaseRevokeDeleteKeys", `sleep("5s")`))
1248+
t.Cleanup(func() {
1249+
_ = gofail.Disable("beforeCheckpointInLeaseRenew")
1250+
_ = gofail.Disable("afterLeaseRevokeDeleteKeys")
1251+
})
1252+
1253+
// Wait until ~2s before lease expires, then send KeepAlive.
1254+
// Renew() will pass the expired() check (lease has ~2s remaining)
1255+
// and then sleep 5s at the failpoint.
1256+
time.Sleep(time.Duration(lresp.TTL-2) * time.Second)
1257+
keepAliveSendTime := time.Now()
1258+
require.NoError(t, lac.Send(&pb.LeaseKeepAliveRequest{ID: leaseID}))
1259+
t.Logf("KeepAlive sent at ~%ds (lease has ~2s remaining)", lresp.TTL-2)
1260+
1261+
// Disable failpoints when done so revoke can complete during cleanup.
1262+
defer func() {
1263+
_ = gofail.Disable("afterLeaseRevokeDeleteKeys")
1264+
_ = gofail.Disable("beforeCheckpointInLeaseRenew")
1265+
}()
1266+
1267+
// Receive the KeepAlive response.
1268+
kaResp, err := lac.Recv()
1269+
require.NoError(t, err)
1270+
t.Logf("KeepAlive response: ID=%x, TTL=%d", kaResp.ID, kaResp.TTL)
1271+
require.Equal(t, leaseID, kaResp.ID)
1272+
1273+
// If the renewal returned expired, the invariant holds trivially.
1274+
if kaResp.TTL <= 0 {
1275+
t.Log("KeepAlive returned TTL=0: lease correctly reported as expired")
1276+
return
1277+
}
1278+
1279+
// Lease reported as alive. Read keys immediately.
1280+
leaseDeadline := keepAliveSendTime.Add(time.Duration(kaResp.TTL) * time.Second)
1281+
var keysRemaining int
1282+
for i := range numKeys {
1283+
rresp, rerr := kvc.Range(ctx, &pb.RangeRequest{
1284+
Key: []byte(fmt.Sprintf("lease-key-%d", i)),
1285+
})
1286+
require.NoError(t, rerr)
1287+
keysRemaining += len(rresp.Kvs)
1288+
}
1289+
readCompleteTime := time.Now()
1290+
t.Logf("Keys remaining: %d/%d (read completed at %v, lease should be valid until %v)",
1291+
keysRemaining, numKeys, readCompleteTime, leaseDeadline)
1292+
1293+
// If the read completed after the lease deadline, we cannot
1294+
// draw conclusions about the invariant. Skip.
1295+
if !readCompleteTime.Before(leaseDeadline) {
1296+
t.Skip("key read completed after lease deadline; cannot verify invariant")
1297+
}
1298+
1299+
// The read completed within the lease validity window.
1300+
// Keys MUST still exist.
1301+
require.Equal(t, numKeys, keysRemaining,
1302+
"KeepAlive returned TTL > 0 but keys are missing — lease invariant violated")
1303+
}
1304+
11711305
// acquireLeaseAndKey creates a new lease and creates an attached key.
11721306
func acquireLeaseAndKey(clus *integration.Cluster, key string) (int64, error) {
11731307
// create lease

0 commit comments

Comments
 (0)