@@ -288,6 +288,12 @@ impl Lighthouse {
288
288
if quorum_met. is_some ( ) {
289
289
let participants = quorum_met. unwrap ( ) ;
290
290
291
+ let commit_failure_replica_ids: Vec < String > = participants
292
+ . iter ( )
293
+ . filter ( |p| p. commit_failures > 0 )
294
+ . map ( |p| p. replica_id . clone ( ) )
295
+ . collect ( ) ;
296
+
291
297
// only increment quorum ID if something about the quorum
292
298
// changed (members/addresses/etc)
293
299
if state. prev_quorum . is_none ( )
@@ -301,6 +307,13 @@ impl Lighthouse {
301
307
"Detected quorum change, bumping quorum_id to {}" ,
302
308
state. quorum_id
303
309
) ;
310
+ } else if commit_failure_replica_ids. len ( ) > 0 {
311
+ state. quorum_id += 1 ;
312
+ info ! (
313
+ "Detected commit failures in [{}], bumping quorum_id to {}" ,
314
+ commit_failure_replica_ids. join( ", " ) ,
315
+ state. quorum_id
316
+ ) ;
304
317
}
305
318
306
319
let quorum = Quorum {
@@ -639,6 +652,7 @@ mod tests {
639
652
world_size : 1 ,
640
653
shrink_only : false ,
641
654
data : String :: new ( ) ,
655
+ commit_failures : 0 ,
642
656
} ,
643
657
} ,
644
658
) ;
@@ -656,6 +670,7 @@ mod tests {
656
670
world_size : 1 ,
657
671
shrink_only : false ,
658
672
data : String :: new ( ) ,
673
+ commit_failures : 0 ,
659
674
} ,
660
675
} ,
661
676
) ;
@@ -712,6 +727,7 @@ mod tests {
712
727
world_size : 1 ,
713
728
shrink_only : false ,
714
729
data : String :: new ( ) ,
730
+ commit_failures : 0 ,
715
731
} ,
716
732
} ,
717
733
) ;
@@ -751,6 +767,7 @@ mod tests {
751
767
world_size : 1 ,
752
768
shrink_only : false ,
753
769
data : String :: new ( ) ,
770
+ commit_failures : 0 ,
754
771
} ,
755
772
} ,
756
773
) ;
@@ -798,6 +815,7 @@ mod tests {
798
815
world_size : 1 ,
799
816
shrink_only : false ,
800
817
data : String :: new ( ) ,
818
+ commit_failures : 0 ,
801
819
} ,
802
820
} ,
803
821
) ;
@@ -819,6 +837,7 @@ mod tests {
819
837
world_size: 1 ,
820
838
shrink_only: false ,
821
839
data: String :: new( ) ,
840
+ commit_failures: 0 ,
822
841
} ] ,
823
842
created : Some ( SystemTime :: now ( ) . into ( ) ) ,
824
843
} ) ;
@@ -838,6 +857,7 @@ mod tests {
838
857
world_size : 1 ,
839
858
shrink_only : false ,
840
859
data : String :: new ( ) ,
860
+ commit_failures : 0 ,
841
861
} ,
842
862
} ,
843
863
) ;
@@ -882,6 +902,7 @@ mod tests {
882
902
world_size: 1 ,
883
903
shrink_only: false ,
884
904
data: String :: new( ) ,
905
+ commit_failures: 0 ,
885
906
} ,
886
907
QuorumMember {
887
908
replica_id: "b" . to_string( ) ,
@@ -891,6 +912,7 @@ mod tests {
891
912
world_size: 1 ,
892
913
shrink_only: false ,
893
914
data: String :: new( ) ,
915
+ commit_failures: 0 ,
894
916
} ,
895
917
] ,
896
918
created : Some ( SystemTime :: now ( ) . into ( ) ) ,
@@ -908,6 +930,7 @@ mod tests {
908
930
world_size : 1 ,
909
931
shrink_only : true ,
910
932
data : String :: new ( ) ,
933
+ commit_failures : 0 ,
911
934
} ,
912
935
} ,
913
936
) ;
@@ -926,6 +949,7 @@ mod tests {
926
949
world_size : 1 ,
927
950
shrink_only : true ,
928
951
data : String :: new ( ) ,
952
+ commit_failures : 0 ,
929
953
} ,
930
954
} ,
931
955
) ;
@@ -975,6 +999,7 @@ mod tests {
975
999
world_size : 1 ,
976
1000
shrink_only : false ,
977
1001
data : String :: new ( ) ,
1002
+ commit_failures : 0 ,
978
1003
} ) ,
979
1004
} ) ;
980
1005
@@ -1021,6 +1046,7 @@ mod tests {
1021
1046
world_size : 1 ,
1022
1047
shrink_only : false ,
1023
1048
data : String :: new ( ) ,
1049
+ commit_failures : 0 ,
1024
1050
} ,
1025
1051
} ,
1026
1052
) ;
@@ -1047,6 +1073,7 @@ mod tests {
1047
1073
world_size: 1 ,
1048
1074
shrink_only: false ,
1049
1075
data: String :: new( ) ,
1076
+ commit_failures: 0 ,
1050
1077
} ] ;
1051
1078
let b = vec ! [ QuorumMember {
1052
1079
replica_id: "1" . to_string( ) ,
@@ -1056,6 +1083,7 @@ mod tests {
1056
1083
world_size: 1 ,
1057
1084
shrink_only: false ,
1058
1085
data: String :: new( ) ,
1086
+ commit_failures: 0 ,
1059
1087
} ] ;
1060
1088
1061
1089
// replica_id is the same
@@ -1069,12 +1097,13 @@ mod tests {
1069
1097
world_size: 1 ,
1070
1098
shrink_only: false ,
1071
1099
data: String :: new( ) ,
1100
+ commit_failures: 0 ,
1072
1101
} ] ;
1073
1102
// replica_id changed
1074
1103
assert ! ( quorum_changed( & a, & c) ) ;
1075
1104
}
1076
- #[ tokio:: test]
1077
1105
1106
+ #[ tokio:: test]
1078
1107
async fn test_lighthouse_join_during_shrink ( ) -> Result < ( ) > {
1079
1108
fn create_member ( id : & str , addr_num : & str , step : i64 , shrink_only : bool ) -> QuorumMember {
1080
1109
QuorumMember {
@@ -1085,6 +1114,7 @@ mod tests {
1085
1114
world_size : 1 ,
1086
1115
shrink_only,
1087
1116
data : String :: new ( ) ,
1117
+ commit_failures : 0 ,
1088
1118
}
1089
1119
}
1090
1120
@@ -1179,4 +1209,76 @@ mod tests {
1179
1209
lighthouse_task. abort ( ) ;
1180
1210
Ok ( ( ) )
1181
1211
}
1212
+
1213
+ #[ tokio:: test]
1214
+ async fn test_lighthouse_commit_failures ( ) -> Result < ( ) > {
1215
+ fn create_member ( id : & str , commit_failures : i64 ) -> QuorumMember {
1216
+ QuorumMember {
1217
+ replica_id : id. to_string ( ) ,
1218
+ address : format ! ( "addr{}" , id) ,
1219
+ store_address : format ! ( "store{}" , id) ,
1220
+ step : 10 ,
1221
+ world_size : 1 ,
1222
+ shrink_only : false ,
1223
+ data : String :: new ( ) ,
1224
+ commit_failures,
1225
+ }
1226
+ }
1227
+
1228
+ fn create_request ( member : & QuorumMember ) -> tonic:: Request < LighthouseQuorumRequest > {
1229
+ tonic:: Request :: new ( LighthouseQuorumRequest {
1230
+ requester : Some ( member. clone ( ) ) ,
1231
+ } )
1232
+ }
1233
+
1234
+ let opt = LighthouseOpt {
1235
+ min_replicas : 2 ,
1236
+ bind : "[::]:0" . to_string ( ) ,
1237
+ join_timeout_ms : 1000 ,
1238
+ quorum_tick_ms : 10 ,
1239
+ heartbeat_timeout_ms : 5000 ,
1240
+ } ;
1241
+
1242
+ // Start the lighthouse service
1243
+ let lighthouse = Lighthouse :: new ( opt) . await ?;
1244
+ let lighthouse_task = tokio:: spawn ( lighthouse. clone ( ) . run ( ) ) ;
1245
+
1246
+ // Create client to interact with lighthouse
1247
+ let mut client = lighthouse_client_new ( lighthouse. address ( ) ) . await ?;
1248
+
1249
+ // First two quorums should be stable
1250
+ for _i in 0 ..2 {
1251
+ let first_request = create_request ( & create_member ( "replica0" , 0 ) ) ;
1252
+ let second_request = create_request ( & create_member ( "replica1" , 0 ) ) ;
1253
+
1254
+ tokio:: spawn ( {
1255
+ let mut client = client. clone ( ) ;
1256
+ async move { client. quorum ( first_request) . await }
1257
+ } ) ;
1258
+ let first_response = client. quorum ( second_request) . await ?;
1259
+ let first_quorum = first_response. into_inner ( ) . quorum . unwrap ( ) ;
1260
+ assert_eq ! ( first_quorum. quorum_id, 1 ) ;
1261
+ assert_eq ! ( first_quorum. participants. len( ) , 2 ) ;
1262
+ assert_eq ! ( first_quorum. participants[ 0 ] . commit_failures, 0 ) ;
1263
+ assert_eq ! ( first_quorum. participants[ 1 ] . commit_failures, 0 ) ;
1264
+ }
1265
+
1266
+ // commit_failures should increment quorum_id
1267
+ let first_request = create_request ( & create_member ( "replica0" , 0 ) ) ;
1268
+ let second_request = create_request ( & create_member ( "replica1" , 2 ) ) ;
1269
+
1270
+ tokio:: spawn ( {
1271
+ let mut client = client. clone ( ) ;
1272
+ async move { client. quorum ( first_request) . await }
1273
+ } ) ;
1274
+ let first_response = client. quorum ( second_request) . await ?;
1275
+ let first_quorum = first_response. into_inner ( ) . quorum . unwrap ( ) ;
1276
+ assert_eq ! ( first_quorum. quorum_id, 2 ) ;
1277
+ assert_eq ! ( first_quorum. participants. len( ) , 2 ) ;
1278
+ assert_eq ! ( first_quorum. participants[ 0 ] . commit_failures, 0 ) ;
1279
+ assert_eq ! ( first_quorum. participants[ 1 ] . commit_failures, 2 ) ;
1280
+
1281
+ lighthouse_task. abort ( ) ;
1282
+ Ok ( ( ) )
1283
+ }
1182
1284
}
0 commit comments