Skip to content
This repository was archived by the owner on Jan 22, 2025. It is now read-only.

Commit cb4e410

Browse files
committed
Make sure to root local slots even with hard fork
1 parent bf9ca98 commit cb4e410

File tree

9 files changed

+485
-116
lines changed

9 files changed

+485
-116
lines changed

core/src/consensus.rs

Lines changed: 55 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1072,7 +1072,7 @@ impl Tower {
10721072
if let Some(last_voted_slot) = self.last_voted_slot() {
10731073
if tower_root <= replayed_root {
10741074
// Normally, we goes into this clause with possible help of
1075-
// reconcile_blockstore_roots_with_tower()
1075+
// reconcile_blockstore_roots_with_external_source()
10761076
if slot_history.check(last_voted_slot) == Check::TooOld {
10771077
// We could try hard to anchor with other older votes, but opt to simplify the
10781078
// following logic
@@ -1320,45 +1320,63 @@ impl TowerError {
13201320
}
13211321
}
13221322

1323+
#[derive(Debug)]
1324+
pub enum ExternalRootSource {
1325+
Tower(Slot),
1326+
HardFork(Slot),
1327+
}
1328+
1329+
impl ExternalRootSource {
1330+
fn root(&self) -> Slot {
1331+
match self {
1332+
ExternalRootSource::Tower(slot) => *slot,
1333+
ExternalRootSource::HardFork(slot) => *slot,
1334+
}
1335+
}
1336+
}
1337+
13231338
// Given an untimely crash, tower may have roots that are not reflected in blockstore,
13241339
// or the reverse of this.
13251340
// That's because we don't impose any ordering guarantee or any kind of write barriers
13261341
// between tower (plain old POSIX fs calls) and blockstore (through RocksDB), when
13271342
// `ReplayState::handle_votable_bank()` saves tower before setting blockstore roots.
1328-
pub fn reconcile_blockstore_roots_with_tower(
1329-
tower: &Tower,
1343+
pub fn reconcile_blockstore_roots_with_external_source(
1344+
external_source: ExternalRootSource,
13301345
blockstore: &Blockstore,
1346+
last_blockstore_root: &mut Slot,
13311347
) -> blockstore_db::Result<()> {
1332-
let tower_root = tower.root();
1333-
let last_blockstore_root = blockstore.last_root();
1334-
if last_blockstore_root < tower_root {
1335-
// Ensure tower_root itself to exist and be marked as rooted in the blockstore
1348+
let external_root = external_source.root();
1349+
if *last_blockstore_root < external_root {
1350+
// Ensure external_root itself to exist and be marked as rooted in the blockstore
13361351
// in addition to its ancestors.
1337-
let new_roots: Vec<_> = AncestorIterator::new_inclusive(tower_root, blockstore)
1338-
.take_while(|current| match current.cmp(&last_blockstore_root) {
1352+
let new_roots: Vec<_> = AncestorIterator::new_inclusive(external_root, blockstore)
1353+
.take_while(|current| match current.cmp(last_blockstore_root) {
13391354
Ordering::Greater => true,
13401355
Ordering::Equal => false,
13411356
Ordering::Less => panic!(
1342-
"couldn't find a last_blockstore_root upwards from: {}!?",
1343-
tower_root
1357+
"last_blockstore_root({}) is skipped while traversing blockstore (currently at {}) from external root ({})!?",
1358+
last_blockstore_root,
1359+
current,
1360+
external_root
13441361
),
13451362
})
13461363
.collect();
13471364
if !new_roots.is_empty() {
13481365
info!(
1349-
"Reconciling slots as root based on tower root: {:?} ({}..{}) ",
1350-
new_roots, tower_root, last_blockstore_root
1366+
"Reconciling slots as root based on external root: {:?} (external: {:?}, blockstore: {})",
1367+
new_roots, external_source, last_blockstore_root
13511368
);
13521369
blockstore.set_roots(new_roots.iter())?;
1370+
*last_blockstore_root = blockstore.last_root();
13531371
} else {
13541372
// This indicates we're in bad state; but still don't panic here.
13551373
// That's because we might have a chance of recovering properly with
13561374
// newer snapshot.
13571375
warn!(
1358-
"Couldn't find any ancestor slots from tower root ({}) \
1376+
"Couldn't find any ancestor slots from external source ({:?}) \
13591377
towards blockstore root ({}); blockstore pruned or only \
1360-
tower moved into new ledger?",
1361-
tower_root, last_blockstore_root,
1378+
tower moved into new ledger or just hard fork?",
1379+
external_source, last_blockstore_root,
13621380
);
13631381
}
13641382
}
@@ -2814,7 +2832,12 @@ pub mod test {
28142832

28152833
let mut tower = Tower::default();
28162834
tower.vote_state.root_slot = Some(4);
2817-
reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
2835+
reconcile_blockstore_roots_with_external_source(
2836+
ExternalRootSource::Tower(tower.root()),
2837+
&blockstore,
2838+
&mut blockstore.last_root(),
2839+
)
2840+
.unwrap();
28182841

28192842
assert!(!blockstore.is_root(0));
28202843
assert!(blockstore.is_root(1));
@@ -2825,7 +2848,9 @@ pub mod test {
28252848
}
28262849

28272850
#[test]
2828-
#[should_panic(expected = "couldn't find a last_blockstore_root upwards from: 4!?")]
2851+
#[should_panic(
2852+
expected = "last_blockstore_root(3) is skipped while traversing blockstore (currently at 1) from external root (4)!?"
2853+
)]
28292854
fn test_reconcile_blockstore_roots_with_tower_panic_no_common_root() {
28302855
solana_logger::setup();
28312856
let blockstore_path = get_tmp_ledger_path!();
@@ -2846,7 +2871,12 @@ pub mod test {
28462871

28472872
let mut tower = Tower::default();
28482873
tower.vote_state.root_slot = Some(4);
2849-
reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
2874+
reconcile_blockstore_roots_with_external_source(
2875+
ExternalRootSource::Tower(tower.root()),
2876+
&blockstore,
2877+
&mut blockstore.last_root(),
2878+
)
2879+
.unwrap();
28502880
}
28512881
Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction");
28522882
}
@@ -2869,7 +2899,12 @@ pub mod test {
28692899
let mut tower = Tower::default();
28702900
tower.vote_state.root_slot = Some(4);
28712901
assert_eq!(blockstore.last_root(), 0);
2872-
reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
2902+
reconcile_blockstore_roots_with_external_source(
2903+
ExternalRootSource::Tower(tower.root()),
2904+
&blockstore,
2905+
&mut blockstore.last_root(),
2906+
)
2907+
.unwrap();
28732908
assert_eq!(blockstore.last_root(), 0);
28742909
}
28752910
Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction");

core/src/validator.rs

Lines changed: 65 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use {
88
cache_block_meta_service::{CacheBlockMetaSender, CacheBlockMetaService},
99
cluster_info_vote_listener::VoteTracker,
1010
completed_data_sets_service::CompletedDataSetsService,
11-
consensus::{reconcile_blockstore_roots_with_tower, Tower},
11+
consensus::{reconcile_blockstore_roots_with_external_source, ExternalRootSource, Tower},
1212
ledger_metric_report_service::LedgerMetricReportService,
1313
poh_timing_report_service::PohTimingReportService,
1414
rewards_recorder_service::{RewardsRecorderSender, RewardsRecorderService},
@@ -512,6 +512,7 @@ impl Validator {
512512
genesis_config,
513513
bank_forks,
514514
blockstore,
515+
original_blockstore_root,
515516
ledger_signal_receiver,
516517
completed_slots_receiver,
517518
leader_schedule_cache,
@@ -667,6 +668,7 @@ impl Validator {
667668
vote_account,
668669
&start_progress,
669670
&blockstore,
671+
original_blockstore_root,
670672
&bank_forks,
671673
&leader_schedule_cache,
672674
&blockstore_process_options,
@@ -1224,6 +1226,17 @@ fn check_poh_speed(genesis_config: &GenesisConfig, maybe_hash_samples: Option<u6
12241226
}
12251227
}
12261228

1229+
fn maybe_cluster_restart_with_hard_fork(config: &ValidatorConfig, root_slot: Slot) -> Option<Slot> {
1230+
// detect cluster restart (hard fork) indirectly via wait_for_supermajority...
1231+
if let Some(wait_slot_for_supermajority) = config.wait_for_supermajority {
1232+
if wait_slot_for_supermajority == root_slot {
1233+
return Some(wait_slot_for_supermajority);
1234+
}
1235+
}
1236+
1237+
None
1238+
}
1239+
12271240
fn post_process_restored_tower(
12281241
restored_tower: crate::consensus::Result<Tower>,
12291242
validator_identity: &Pubkey,
@@ -1237,29 +1250,28 @@ fn post_process_restored_tower(
12371250
.and_then(|tower| {
12381251
let root_bank = bank_forks.root_bank();
12391252
let slot_history = root_bank.get_slot_history();
1253+
// make sure tower isn't corrupted first before the following hard fork check
12401254
let tower = tower.adjust_lockouts_after_replay(root_bank.slot(), &slot_history);
12411255

1242-
if let Some(wait_slot_for_supermajority) = config.wait_for_supermajority {
1243-
if root_bank.slot() == wait_slot_for_supermajority {
1244-
// intentionally fail to restore tower; we're supposedly in a new hard fork; past
1245-
// out-of-chain vote state doesn't make sense at all
1246-
// what if --wait-for-supermajority again if the validator restarted?
1247-
let message = format!("Hardfork is detected; discarding tower restoration result: {:?}", tower);
1248-
datapoint_error!(
1249-
"tower_error",
1250-
(
1251-
"error",
1252-
message,
1253-
String
1254-
),
1255-
);
1256-
error!("{}", message);
1256+
if let Some(hard_fork_restart_slot) = maybe_cluster_restart_with_hard_fork(config, root_bank.slot()) {
1257+
// intentionally fail to restore tower; we're supposedly in a new hard fork; past
1258+
// out-of-chain vote state doesn't make sense at all
1259+
// what if --wait-for-supermajority again if the validator restarted?
1260+
let message = format!("Hard fork is detected; discarding tower restoration result: {:?}", tower);
1261+
datapoint_error!(
1262+
"tower_error",
1263+
(
1264+
"error",
1265+
message,
1266+
String
1267+
),
1268+
);
1269+
error!("{}", message);
12571270

1258-
// unconditionally relax tower requirement so that we can always restore tower
1259-
// from root bank.
1260-
should_require_tower = false;
1261-
return Err(crate::consensus::TowerError::HardFork(wait_slot_for_supermajority));
1262-
}
1271+
// unconditionally relax tower requirement so that we can always restore tower
1272+
// from root bank.
1273+
should_require_tower = false;
1274+
return Err(crate::consensus::TowerError::HardFork(hard_fork_restart_slot));
12631275
}
12641276

12651277
if let Some(warp_slot) = config.warp_slot {
@@ -1326,6 +1338,7 @@ fn load_blockstore(
13261338
GenesisConfig,
13271339
Arc<RwLock<BankForks>>,
13281340
Arc<Blockstore>,
1341+
Slot,
13291342
Receiver<bool>,
13301343
CompletedSlotsReceiver,
13311344
LeaderScheduleCache,
@@ -1378,6 +1391,9 @@ fn load_blockstore(
13781391
.expect("Failed to open ledger database");
13791392
blockstore.set_no_compaction(config.no_rocksdb_compaction);
13801393
blockstore.shred_timing_point_sender = poh_timing_point_sender;
1394+
// following boot sequence (esp BankForks) could set root. so stash the original value
1395+
// of blockstore root away here as soon as possible.
1396+
let original_blockstore_root = blockstore.last_root();
13811397

13821398
let blockstore = Arc::new(blockstore);
13831399
let blockstore_root_scan = BlockstoreRootScan::new(config, &blockstore, exit);
@@ -1482,6 +1498,7 @@ fn load_blockstore(
14821498
genesis_config,
14831499
bank_forks,
14841500
blockstore,
1501+
original_blockstore_root,
14851502
ledger_signal_receiver,
14861503
completed_slots_receiver,
14871504
leader_schedule_cache,
@@ -1521,6 +1538,7 @@ struct ProcessBlockStore<'a> {
15211538
vote_account: &'a Pubkey,
15221539
start_progress: &'a Arc<RwLock<ValidatorStartProgress>>,
15231540
blockstore: &'a Blockstore,
1541+
original_blockstore_root: Slot,
15241542
bank_forks: &'a Arc<RwLock<BankForks>>,
15251543
leader_schedule_cache: &'a LeaderScheduleCache,
15261544
process_options: &'a blockstore_processor::ProcessOptions,
@@ -1539,6 +1557,7 @@ impl<'a> ProcessBlockStore<'a> {
15391557
vote_account: &'a Pubkey,
15401558
start_progress: &'a Arc<RwLock<ValidatorStartProgress>>,
15411559
blockstore: &'a Blockstore,
1560+
original_blockstore_root: Slot,
15421561
bank_forks: &'a Arc<RwLock<BankForks>>,
15431562
leader_schedule_cache: &'a LeaderScheduleCache,
15441563
process_options: &'a blockstore_processor::ProcessOptions,
@@ -1553,6 +1572,7 @@ impl<'a> ProcessBlockStore<'a> {
15531572
vote_account,
15541573
start_progress,
15551574
blockstore,
1575+
original_blockstore_root,
15561576
bank_forks,
15571577
leader_schedule_cache,
15581578
process_options,
@@ -1622,12 +1642,30 @@ impl<'a> ProcessBlockStore<'a> {
16221642
self.tower = Some({
16231643
let restored_tower = Tower::restore(self.config.tower_storage.as_ref(), self.id);
16241644
if let Ok(tower) = &restored_tower {
1625-
reconcile_blockstore_roots_with_tower(tower, self.blockstore).unwrap_or_else(
1626-
|err| {
1627-
error!("Failed to reconcile blockstore with tower: {:?}", err);
1628-
abort()
1629-
},
1630-
);
1645+
reconcile_blockstore_roots_with_external_source(
1646+
ExternalRootSource::Tower(tower.root()),
1647+
self.blockstore,
1648+
&mut self.original_blockstore_root,
1649+
)
1650+
.unwrap_or_else(|err| {
1651+
error!("Failed to reconcile blockstore with tower: {:?}", err);
1652+
abort()
1653+
});
1654+
}
1655+
1656+
if let Some(hard_fork_restart_slot) = maybe_cluster_restart_with_hard_fork(
1657+
self.config,
1658+
self.bank_forks.read().unwrap().root_bank().slot(),
1659+
) {
1660+
reconcile_blockstore_roots_with_external_source(
1661+
ExternalRootSource::HardFork(hard_fork_restart_slot),
1662+
self.blockstore,
1663+
&mut self.original_blockstore_root,
1664+
)
1665+
.unwrap_or_else(|err| {
1666+
error!("Failed to reconcile blockstore with hard fork: {:?}", err);
1667+
abort()
1668+
});
16311669
}
16321670

16331671
post_process_restored_tower(

ledger/src/blockstore.rs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -523,13 +523,28 @@ impl Blockstore {
523523
Ok(slot_iterator.take_while(move |((shred_slot, _), _)| *shred_slot == slot))
524524
}
525525

526-
pub fn rooted_slot_iterator(&self, slot: Slot) -> Result<impl Iterator<Item = u64> + '_> {
526+
fn prepare_rooted_slot_iterator(
527+
&self,
528+
slot: Slot,
529+
direction: IteratorDirection,
530+
) -> Result<impl Iterator<Item = Slot> + '_> {
527531
let slot_iterator = self
528532
.db
529-
.iter::<cf::Root>(IteratorMode::From(slot, IteratorDirection::Forward))?;
533+
.iter::<cf::Root>(IteratorMode::From(slot, direction))?;
530534
Ok(slot_iterator.map(move |(rooted_slot, _)| rooted_slot))
531535
}
532536

537+
pub fn rooted_slot_iterator(&self, slot: Slot) -> Result<impl Iterator<Item = Slot> + '_> {
538+
self.prepare_rooted_slot_iterator(slot, IteratorDirection::Forward)
539+
}
540+
541+
pub fn reversed_rooted_slot_iterator(
542+
&self,
543+
slot: Slot,
544+
) -> Result<impl Iterator<Item = Slot> + '_> {
545+
self.prepare_rooted_slot_iterator(slot, IteratorDirection::Reverse)
546+
}
547+
533548
/// Determines if starting_slot and ending_slot are connected
534549
pub fn slots_connected(&self, starting_slot: Slot, ending_slot: Slot) -> bool {
535550
let mut next_slots: VecDeque<_> = vec![starting_slot].into();
@@ -1725,6 +1740,13 @@ impl Blockstore {
17251740
self.meta_cf.put_bytes(slot, bytes)
17261741
}
17271742

1743+
/// Manually update the meta for a slot.
1744+
/// Can interfere with automatic meta update and potentially break chaining.
1745+
/// Dangerous. Use with care.
1746+
pub fn put_meta(&self, slot: Slot, meta: &SlotMeta) -> Result<()> {
1747+
self.put_meta_bytes(slot, &bincode::serialize(meta)?)
1748+
}
1749+
17281750
// Given a start and end entry index, find all the missing
17291751
// indexes in the ledger in the range [start_index, end_index)
17301752
// for the slot with the specified slot

ledger/src/blockstore_meta.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,10 @@ impl SlotMeta {
214214
Some(self.consumed) == self.last_index.map(|ix| ix + 1)
215215
}
216216

217+
pub fn unset_parent(&mut self) {
218+
self.parent_slot = None;
219+
}
220+
217221
pub fn clear_unconfirmed_slot(&mut self) {
218222
let mut new_self = SlotMeta::new_orphan(self.slot);
219223
std::mem::swap(&mut new_self.next_slots, &mut self.next_slots);

0 commit comments

Comments
 (0)