Skip to content
This repository was archived by the owner on Jan 22, 2025. It is now read-only.

Commit 07955e7

Browse files
authored
replay: gracefully exit if tower load fails (#35269)
1 parent: 60ccdb3 · commit: 07955e7

File tree

1 file changed

+64
-33
lines changed

1 file changed

+64
-33
lines changed

core/src/replay_stage.rs

+64 -33
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ use {
1616
progress_map::{ForkProgress, ProgressMap, PropagatedStats, ReplaySlotStats},
1717
tower_storage::{SavedTower, SavedTowerVersions, TowerStorage},
1818
BlockhashStatus, ComputedBankState, Stake, SwitchForkDecision, ThresholdDecision,
19-
Tower, VotedStakes, SWITCH_FORK_THRESHOLD,
19+
Tower, TowerError, VotedStakes, SWITCH_FORK_THRESHOLD,
2020
},
2121
cost_update_service::CostUpdate,
2222
repair::{
@@ -580,12 +580,25 @@ impl ReplayStage {
580580
// set-identity was called during the startup procedure, ensure the tower is consistent
581581
// before starting the loop. further calls to set-identity will reload the tower in the loop
582582
let my_old_pubkey = tower.node_pubkey;
583-
tower = Self::load_tower(
583+
tower = match Self::load_tower(
584584
tower_storage.as_ref(),
585585
&my_pubkey,
586586
&vote_account,
587587
&bank_forks,
588-
);
588+
) {
589+
Ok(tower) => tower,
590+
Err(err) => {
591+
error!(
592+
"Unable to load new tower when attempting to change identity from {} to {} on
593+
ReplayStage startup, Exiting: {}",
594+
my_old_pubkey,
595+
my_pubkey,
596+
err
597+
);
598+
// drop(_exit) will set the exit flag, eventually tearing down the entire process
599+
return;
600+
}
601+
};
589602
warn!(
590603
"Identity changed during startup from {} to {}",
591604
my_old_pubkey, my_pubkey
@@ -997,12 +1010,25 @@ impl ReplayStage {
9971010
my_pubkey = identity_keypair.pubkey();
9981011

9991012
// Load the new identity's tower
1000-
tower = Self::load_tower(
1013+
tower = match Self::load_tower(
10011014
tower_storage.as_ref(),
10021015
&my_pubkey,
10031016
&vote_account,
10041017
&bank_forks,
1005-
);
1018+
) {
1019+
Ok(tower) => tower,
1020+
Err(err) => {
1021+
error!(
1022+
"Unable to load new tower when attempting to change identity
1023+
from {} to {} on set-identity, Exiting: {}",
1024+
my_old_pubkey,
1025+
my_pubkey,
1026+
err
1027+
);
1028+
// drop(_exit) will set the exit flag, eventually tearing down the entire process
1029+
return;
1030+
}
1031+
};
10061032
// Ensure the validator can land votes with the new identity before
10071033
// becoming leader
10081034
has_new_vote_been_rooted = !wait_for_vote_to_start_leader;
@@ -1152,37 +1178,40 @@ impl ReplayStage {
11521178
})
11531179
}
11541180

1181+
/// Loads the tower from `tower_storage` with identity `node_pubkey`.
1182+
///
1183+
/// If the tower is missing or too old, a tower is constructed from bank forks.
11551184
fn load_tower(
11561185
tower_storage: &dyn TowerStorage,
11571186
node_pubkey: &Pubkey,
11581187
vote_account: &Pubkey,
11591188
bank_forks: &Arc<RwLock<BankForks>>,
1160-
) -> Tower {
1161-
Tower::restore(tower_storage, node_pubkey)
1162-
.and_then(|restored_tower| {
1163-
let root_bank = bank_forks.read().unwrap().root_bank();
1164-
let slot_history = root_bank.get_slot_history();
1165-
restored_tower.adjust_lockouts_after_replay(root_bank.slot(), &slot_history)
1166-
})
1167-
.unwrap_or_else(|err| {
1168-
if err.is_file_missing() {
1169-
Tower::new_from_bankforks(
1170-
&bank_forks.read().unwrap(),
1171-
node_pubkey,
1172-
vote_account,
1173-
)
1174-
} else if err.is_too_old() {
1175-
warn!("Failed to load tower, too old for {}: {}. Creating a new tower from bankforks.", node_pubkey, err);
1176-
Tower::new_from_bankforks(
1177-
&bank_forks.read().unwrap(),
1178-
node_pubkey,
1179-
vote_account,
1180-
)
1181-
} else {
1182-
error!("Failed to load tower for {}: {}", node_pubkey, err);
1183-
std::process::exit(1);
1184-
}
1185-
})
1189+
) -> Result<Tower, TowerError> {
1190+
let tower = Tower::restore(tower_storage, node_pubkey).and_then(|restored_tower| {
1191+
let root_bank = bank_forks.read().unwrap().root_bank();
1192+
let slot_history = root_bank.get_slot_history();
1193+
restored_tower.adjust_lockouts_after_replay(root_bank.slot(), &slot_history)
1194+
});
1195+
match tower {
1196+
Ok(tower) => Ok(tower),
1197+
Err(err) if err.is_file_missing() => {
1198+
warn!("Failed to load tower, file missing for {}: {}. Creating a new tower from bankforks.", node_pubkey, err);
1199+
Ok(Tower::new_from_bankforks(
1200+
&bank_forks.read().unwrap(),
1201+
node_pubkey,
1202+
vote_account,
1203+
))
1204+
}
1205+
Err(err) if err.is_too_old() => {
1206+
warn!("Failed to load tower, too old for {}: {}. Creating a new tower from bankforks.", node_pubkey, err);
1207+
Ok(Tower::new_from_bankforks(
1208+
&bank_forks.read().unwrap(),
1209+
node_pubkey,
1210+
vote_account,
1211+
))
1212+
}
1213+
Err(err) => Err(err),
1214+
}
11861215
}
11871216

11881217
fn check_for_vote_only_mode(
@@ -8643,7 +8672,8 @@ pub(crate) mod tests {
86438672
let bank_forks = vote_simulator.bank_forks;
86448673

86458674
let tower =
8646-
ReplayStage::load_tower(&tower_storage, &node_pubkey, &vote_account, &bank_forks);
8675+
ReplayStage::load_tower(&tower_storage, &node_pubkey, &vote_account, &bank_forks)
8676+
.unwrap();
86478677
let expected_tower = Tower::new_for_tests(VOTE_THRESHOLD_DEPTH, VOTE_THRESHOLD_SIZE);
86488678
assert_eq!(tower.vote_state, expected_tower.vote_state);
86498679
assert_eq!(tower.node_pubkey, node_pubkey);
@@ -8670,7 +8700,8 @@ pub(crate) mod tests {
86708700
expected_tower.save(&tower_storage, &node_keypair).unwrap();
86718701

86728702
let tower =
8673-
ReplayStage::load_tower(&tower_storage, &node_pubkey, &vote_account, &bank_forks);
8703+
ReplayStage::load_tower(&tower_storage, &node_pubkey, &vote_account, &bank_forks)
8704+
.unwrap();
86748705
assert_eq!(tower.vote_state, expected_tower.vote_state);
86758706
assert_eq!(tower.node_pubkey, expected_tower.node_pubkey);
86768707
}

0 commit comments

Comments
 (0)