@@ -1168,6 +1168,140 @@ func testV3LeaseTimeToLiveWithLeaderChanged(t *testing.T, fpName string) {
11681168 require .NoError (t , <- errCh )
11691169}
11701170
1171+ // TestV3LeaseKeysDeletedBeforeExpiry tests the invariant that if a KeepAlive
1172+ // returns TTL > 0, the attached keys must still exist.
1173+ //
1174+ // It reproduces the race condition described in
1175+ // https://github.com/etcd-io/etcd/issues/14758 by letting the lease expire
1176+ // while a KeepAlive is in-flight and delayed by a failpoint:
1177+ //
1178+ // 1. Client sends KeepAlive ~2s before lease expires.
1179+ // 2. LeaseRenew → waitAppliedIndex (succeeds, nothing pending) → Renew()
1180+ // passes l.expired() check → hits beforeCheckpointInLeaseRenew → sleeps.
1181+ // 3. Lease expires. revokeExpiredLeases() detects it, proposes LeaseRevoke
1182+ // through Raft.
1183+ // 4. Raft commits. Apply goroutine calls Revoke() → deletes keys →
1184+ // hits afterLeaseRevokeDeleteKeys → pauses before removing from leaseMap.
1185+ // 5. Renew wakes up, acquires Lock, finds lease in leaseMap.
1186+ // 6. Client receives KeepAlive response.
1187+ // 7. Revoke resumes, removes lease from leaseMap, apply unblocks.
1188+ //
1189+ // The invariant: if the KeepAlive response has TTL > 0, the keys must still
1190+ // exist. If TTL ≤ 0, the keys may or may not exist.
1191+ func TestV3LeaseKeysDeletedBeforeExpiry (t * testing.T ) {
1192+ integration .SkipIfNoGoFail (t )
1193+ integration .BeforeTest (t )
1194+
1195+ clus := integration .NewCluster (t , & integration.ClusterConfig {Size : 1 })
1196+ defer clus .Terminate (t )
1197+
1198+ ctx , cancel := context .WithTimeout (t .Context (), 60 * time .Second )
1199+ defer cancel ()
1200+
1201+ client := clus .RandClient ()
1202+ lc := integration .ToGRPC (client ).Lease
1203+ kvc := integration .ToGRPC (client ).KV
1204+
1205+ // Grant a lease with 10s TTL to give the race enough time to develop.
1206+ lresp , err := lc .LeaseGrant (ctx , & pb.LeaseGrantRequest {TTL : 10 })
1207+ require .NoError (t , err )
1208+ leaseID := lresp .ID
1209+ t .Logf ("Lease %x granted with TTL=%d" , leaseID , lresp .TTL )
1210+
1211+ // Attach keys to the lease.
1212+ numKeys := 3
1213+ for i := range numKeys {
1214+ _ , err := kvc .Put (ctx , & pb.PutRequest {
1215+ Key : []byte (fmt .Sprintf ("lease-key-%d" , i )),
1216+ Value : []byte (fmt .Sprintf ("val-%d" , i )),
1217+ Lease : leaseID ,
1218+ })
1219+ require .NoError (t , err )
1220+ }
1221+
1222+ // Verify keys are attached.
1223+ ttlResp , err := lc .LeaseTimeToLive (ctx , & pb.LeaseTimeToLiveRequest {ID : leaseID , Keys : true })
1224+ require .NoError (t , err )
1225+ require .Len (t , ttlResp .Keys , numKeys )
1226+
1227+ // Open a KeepAlive stream early (before enabling failpoints).
1228+ lac , err := lc .LeaseKeepAlive (ctx )
1229+ require .NoError (t , err )
1230+
1231+ // Enable failpoints to widen the race window:
1232+ //
1233+ // beforeCheckpointInLeaseRenew: delays Renew() by 5s AFTER it passes
1234+ // l.expired() but BEFORE it acquires Lock to refresh.
1235+ //
1236+ // afterLeaseRevokeDeleteKeys: delays Revoke() by 5s AFTER deleting
1237+ // keys but BEFORE removing lease from leaseMap.
1238+ //
1239+ // Timeline:
1240+ // t=0s: Lease granted (TTL=10s)
1241+ // t=8s: KeepAlive sent → Renew passes expired() → sleeps 5s
1242+ // t=10s: Lease expires
1243+ // t=11-12s: revokeExpiredLeases → Raft → Revoke → deletes keys → pauses 5s
1244+ // t=13s: Renew wakes → finds lease in leaseMap → returns response
1245+ // t=16-17s: Revoke resumes → lease deleted → apply unblocks
1246+ require .NoError (t , gofail .Enable ("beforeCheckpointInLeaseRenew" , `sleep("5s")` ))
1247+ require .NoError (t , gofail .Enable ("afterLeaseRevokeDeleteKeys" , `sleep("5s")` ))
1248+ t .Cleanup (func () {
1249+ _ = gofail .Disable ("beforeCheckpointInLeaseRenew" )
1250+ _ = gofail .Disable ("afterLeaseRevokeDeleteKeys" )
1251+ })
1252+
1253+ // Wait until ~2s before lease expires, then send KeepAlive.
1254+ // Renew() will pass the expired() check (lease has ~2s remaining)
1255+ // and then sleep 5s at the failpoint.
1256+ time .Sleep (time .Duration (lresp .TTL - 2 ) * time .Second )
1257+ keepAliveSendTime := time .Now ()
1258+ require .NoError (t , lac .Send (& pb.LeaseKeepAliveRequest {ID : leaseID }))
1259+ t .Logf ("KeepAlive sent at ~%ds (lease has ~2s remaining)" , lresp .TTL - 2 )
1260+
1261+ // Disable failpoints when done so revoke can complete during cleanup.
1262+ defer func () {
1263+ _ = gofail .Disable ("afterLeaseRevokeDeleteKeys" )
1264+ _ = gofail .Disable ("beforeCheckpointInLeaseRenew" )
1265+ }()
1266+
1267+ // Receive the KeepAlive response.
1268+ kaResp , err := lac .Recv ()
1269+ require .NoError (t , err )
1270+ t .Logf ("KeepAlive response: ID=%x, TTL=%d" , kaResp .ID , kaResp .TTL )
1271+ require .Equal (t , leaseID , kaResp .ID )
1272+
1273+ // If the renewal returned expired, the invariant holds trivially.
1274+ if kaResp .TTL <= 0 {
1275+ t .Log ("KeepAlive returned TTL=0: lease correctly reported as expired" )
1276+ return
1277+ }
1278+
1279+ // Lease reported as alive. Read keys immediately.
1280+ leaseDeadline := keepAliveSendTime .Add (time .Duration (kaResp .TTL ) * time .Second )
1281+ var keysRemaining int
1282+ for i := range numKeys {
1283+ rresp , rerr := kvc .Range (ctx , & pb.RangeRequest {
1284+ Key : []byte (fmt .Sprintf ("lease-key-%d" , i )),
1285+ })
1286+ require .NoError (t , rerr )
1287+ keysRemaining += len (rresp .Kvs )
1288+ }
1289+ readCompleteTime := time .Now ()
1290+ t .Logf ("Keys remaining: %d/%d (read completed at %v, lease should be valid until %v)" ,
1291+ keysRemaining , numKeys , readCompleteTime , leaseDeadline )
1292+
1293+ // If the read completed after the lease deadline, we cannot
1294+ // draw conclusions about the invariant. Skip.
1295+ if ! readCompleteTime .Before (leaseDeadline ) {
1296+ t .Skip ("key read completed after lease deadline; cannot verify invariant" )
1297+ }
1298+
1299+ // The read completed within the lease validity window.
1300+ // Keys MUST still exist.
1301+ require .Equal (t , numKeys , keysRemaining ,
1302+ "KeepAlive returned TTL > 0 but keys are missing — lease invariant violated" )
1303+ }
1304+
11711305// acquireLeaseAndKey creates a new lease and creates an attached key.
11721306func acquireLeaseAndKey (clus * integration.Cluster , key string ) (int64 , error ) {
11731307 // create lease
0 commit comments