Skip to content
This repository was archived by the owner on Jan 22, 2025. It is now read-only.

Commit f616d28

Browse files
committed
Make sure to root local slots even with hard fork
1 parent 7b365c5 commit f616d28

File tree

6 files changed

+411
-40
lines changed

6 files changed

+411
-40
lines changed

core/src/consensus.rs

Lines changed: 49 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,7 @@ impl Tower {
969969
if let Some(last_voted_slot) = self.last_voted_slot() {
970970
if tower_root <= replayed_root {
971971
// Normally, we go into this clause with the possible help of
972-
// reconcile_blockstore_roots_with_tower()
972+
// reconcile_blockstore_roots_with_external_source()
973973
if slot_history.check(last_voted_slot) == Check::TooOld {
974974
// We could try hard to anchor with other older votes, but opt to simplify the
975975
// following logic
@@ -1221,45 +1221,61 @@ impl TowerError {
12211221
}
12221222
}
12231223

1224+
#[derive(Debug)]
1225+
pub enum ExternalRootSource {
1226+
Tower(Slot),
1227+
HardFork(Slot),
1228+
}
1229+
1230+
impl ExternalRootSource {
1231+
fn root(&self) -> Slot {
1232+
match self {
1233+
ExternalRootSource::Tower(slot) => *slot,
1234+
ExternalRootSource::HardFork(slot) => *slot,
1235+
}
1236+
}
1237+
}
1238+
12241239
// Given an untimely crash, tower may have roots that are not reflected in blockstore,
12251240
// or the reverse of this.
12261241
// That's because we don't impose any ordering guarantee or any kind of write barriers
12271242
// between tower (plain old POSIX fs calls) and blockstore (through RocksDB), when
12281243
// `ReplayState::handle_votable_bank()` saves tower before setting blockstore roots.
1229-
pub fn reconcile_blockstore_roots_with_tower(
1230-
tower: &Tower,
1244+
pub fn reconcile_blockstore_roots_with_external_source(
1245+
external_source: ExternalRootSource,
12311246
blockstore: &Blockstore,
1247+
last_blockstore_root: &mut Slot,
12321248
) -> blockstore_db::Result<()> {
1233-
let tower_root = tower.root();
1234-
let last_blockstore_root = blockstore.last_root();
1235-
if last_blockstore_root < tower_root {
1236-
// Ensure tower_root itself to exist and be marked as rooted in the blockstore
1249+
let external_root = external_source.root();
1250+
if *last_blockstore_root < external_root {
1251+
// Ensure that external_root itself exists and is marked as rooted in the blockstore
12371252
// in addition to its ancestors.
1238-
let new_roots: Vec<_> = AncestorIterator::new_inclusive(tower_root, blockstore)
1239-
.take_while(|current| match current.cmp(&last_blockstore_root) {
1253+
let new_roots: Vec<_> = AncestorIterator::new_inclusive(external_root, blockstore)
1254+
.take_while(|current| match current.cmp(last_blockstore_root) {
12401255
Ordering::Greater => true,
12411256
Ordering::Equal => false,
12421257
Ordering::Less => panic!(
12431258
"couldn't find a last_blockstore_root upwards from: {}!?",
1244-
tower_root
1259+
external_root
12451260
),
12461261
})
12471262
.collect();
12481263
if !new_roots.is_empty() {
12491264
info!(
1250-
"Reconciling slots as root based on tower root: {:?} ({}..{}) ",
1251-
new_roots, tower_root, last_blockstore_root
1265+
"Reconciling slots as root based on external root: {:?} ({}..{}) ",
1266+
new_roots, external_root, last_blockstore_root
12521267
);
12531268
blockstore.set_roots(new_roots.iter())?;
1269+
*last_blockstore_root = blockstore.last_root();
12541270
} else {
12551271
// This indicates we're in bad state; but still don't panic here.
12561272
// That's because we might have a chance of recovering properly with
12571273
// newer snapshot.
12581274
warn!(
1259-
"Couldn't find any ancestor slots from tower root ({}) \
1275+
"Couldn't find any ancestor slots from external source ({:?}) \
12601276
towards blockstore root ({}); blockstore pruned or only \
1261-
tower moved into new ledger?",
1262-
tower_root, last_blockstore_root,
1277+
tower moved into new ledger or just hard fork?",
1278+
external_source, last_blockstore_root,
12631279
);
12641280
}
12651281
}
@@ -2737,7 +2753,12 @@ pub mod test {
27372753

27382754
let mut tower = Tower::default();
27392755
tower.vote_state.root_slot = Some(4);
2740-
reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
2756+
reconcile_blockstore_roots_with_external_source(
2757+
ExternalRootSource::Tower(tower.root()),
2758+
&blockstore,
2759+
&mut blockstore.last_root(),
2760+
)
2761+
.unwrap();
27412762

27422763
assert!(!blockstore.is_root(0));
27432764
assert!(blockstore.is_root(1));
@@ -2769,7 +2790,12 @@ pub mod test {
27692790

27702791
let mut tower = Tower::default();
27712792
tower.vote_state.root_slot = Some(4);
2772-
reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
2793+
reconcile_blockstore_roots_with_external_source(
2794+
ExternalRootSource::Tower(tower.root()),
2795+
&blockstore,
2796+
&mut blockstore.last_root(),
2797+
)
2798+
.unwrap();
27732799
}
27742800
Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction");
27752801
}
@@ -2792,7 +2818,12 @@ pub mod test {
27922818
let mut tower = Tower::default();
27932819
tower.vote_state.root_slot = Some(4);
27942820
assert_eq!(blockstore.last_root(), 0);
2795-
reconcile_blockstore_roots_with_tower(&tower, &blockstore).unwrap();
2821+
reconcile_blockstore_roots_with_external_source(
2822+
ExternalRootSource::Tower(tower.root()),
2823+
&blockstore,
2824+
&mut blockstore.last_root(),
2825+
)
2826+
.unwrap();
27962827
assert_eq!(blockstore.last_root(), 0);
27972828
}
27982829
Blockstore::destroy(&blockstore_path).expect("Expected successful database destruction");

core/src/validator.rs

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use {
66
cache_block_meta_service::{CacheBlockMetaSender, CacheBlockMetaService},
77
cluster_info_vote_listener::VoteTracker,
88
completed_data_sets_service::CompletedDataSetsService,
9-
consensus::{reconcile_blockstore_roots_with_tower, Tower},
9+
consensus::{reconcile_blockstore_roots_with_external_source, ExternalRootSource, Tower},
1010
cost_model::CostModel,
1111
rewards_recorder_service::{RewardsRecorderSender, RewardsRecorderService},
1212
sample_performance_service::SamplePerformanceService,
@@ -158,6 +158,7 @@ pub struct ValidatorConfig {
158158
pub validator_exit: Arc<RwLock<Exit>>,
159159
pub no_wait_for_vote_to_start_leader: bool,
160160
pub accounts_shrink_ratio: AccountShrinkThreshold,
161+
pub no_hard_fork_blockstore_root_reconcilation_for_local_cluster_test: bool,
161162
}
162163

163164
impl Default for ValidatorConfig {
@@ -217,6 +218,7 @@ impl Default for ValidatorConfig {
217218
no_wait_for_vote_to_start_leader: true,
218219
accounts_shrink_ratio: AccountShrinkThreshold::default(),
219220
accounts_db_config: None,
221+
no_hard_fork_blockstore_root_reconcilation_for_local_cluster_test: false,
220222
}
221223
}
222224
}
@@ -1005,14 +1007,16 @@ fn post_process_restored_tower(
10051007
.and_then(|tower| {
10061008
let root_bank = bank_forks.root_bank();
10071009
let slot_history = root_bank.get_slot_history();
1010+
// make sure the tower isn't corrupted before the following hard fork check
10081011
let tower = tower.adjust_lockouts_after_replay(root_bank.slot(), &slot_history);
10091012

1013+
// detect cluster-wide restart (hard fork) indirectly via wait_for_supermajority...
10101014
if let Some(wait_slot_for_supermajority) = config.wait_for_supermajority {
1011-
if root_bank.slot() == wait_slot_for_supermajority {
1015+
if wait_slot_for_supermajority == root_bank.slot() {
10121016
// intentionally fail to restore tower; we're supposedly in a new hard fork; past
10131017
// out-of-chain vote state doesn't make sense at all
10141018
// what if --wait-for-supermajority is specified again when the validator restarts?
1015-
let message = format!("Hardfork is detected; discarding tower restoration result: {:?}", tower);
1019+
let message = format!("Hard fork is detected; discarding tower restoration result: {:?}", tower);
10161020
datapoint_error!(
10171021
"tower_error",
10181022
(
@@ -1143,11 +1147,22 @@ fn new_banks_from_ledger(
11431147
)
11441148
.expect("Failed to open ledger database");
11451149
blockstore.set_no_compaction(config.no_rocksdb_compaction);
1150+
// The following boot sequence (esp. BankForks) could set a root, so stash the original value
1151+
// of blockstore root away here as soon as possible.
1152+
let mut last_blockstore_root = blockstore.last_root();
11461153

11471154
let restored_tower = Tower::restore(config.tower_storage.as_ref(), validator_identity);
11481155
if let Ok(tower) = &restored_tower {
1149-
reconcile_blockstore_roots_with_tower(tower, &blockstore).unwrap_or_else(|err| {
1150-
error!("Failed to reconcile blockstore with tower: {:?}", err);
1156+
reconcile_blockstore_roots_with_external_source(
1157+
ExternalRootSource::Tower(tower.root()),
1158+
&blockstore,
1159+
&mut last_blockstore_root,
1160+
)
1161+
.unwrap_or_else(|err| {
1162+
error!(
1163+
"Failed to reconcile blockstore according to tower: {:?}",
1164+
err
1165+
);
11511166
abort()
11521167
});
11531168
}
@@ -1266,6 +1281,24 @@ fn new_banks_from_ledger(
12661281
);
12671282
}
12681283

1284+
if let Some(wait_slot_for_supermajority) = config.wait_for_supermajority {
1285+
if wait_slot_for_supermajority == bank_forks.root_bank().slot()
1286+
&& !config.no_hard_fork_blockstore_root_reconcilation_for_local_cluster_test
1287+
{
1288+
reconcile_blockstore_roots_with_external_source(
1289+
ExternalRootSource::HardFork(wait_slot_for_supermajority),
1290+
&blockstore,
1291+
&mut last_blockstore_root,
1292+
)
1293+
.unwrap_or_else(|err| {
1294+
error!(
1295+
"Failed to reconcile blockstore according to hard fork: {:?}",
1296+
err
1297+
);
1298+
abort()
1299+
});
1300+
}
1301+
}
12691302
let tower = post_process_restored_tower(
12701303
restored_tower,
12711304
validator_identity,

ledger/src/blockstore.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -576,13 +576,28 @@ impl Blockstore {
576576
Ok(slot_iterator.take_while(move |((shred_slot, _), _)| *shred_slot == slot))
577577
}
578578

579-
pub fn rooted_slot_iterator(&self, slot: Slot) -> Result<impl Iterator<Item = u64> + '_> {
579+
fn prepare_rooted_slot_iterator(
580+
&self,
581+
slot: Slot,
582+
direction: IteratorDirection,
583+
) -> Result<impl Iterator<Item = Slot> + '_> {
580584
let slot_iterator = self
581585
.db
582-
.iter::<cf::Root>(IteratorMode::From(slot, IteratorDirection::Forward))?;
586+
.iter::<cf::Root>(IteratorMode::From(slot, direction))?;
583587
Ok(slot_iterator.map(move |(rooted_slot, _)| rooted_slot))
584588
}
585589

590+
pub fn rooted_slot_iterator(&self, slot: Slot) -> Result<impl Iterator<Item = Slot> + '_> {
591+
self.prepare_rooted_slot_iterator(slot, IteratorDirection::Forward)
592+
}
593+
594+
pub fn reversed_rooted_slot_iterator(
595+
&self,
596+
slot: Slot,
597+
) -> Result<impl Iterator<Item = Slot> + '_> {
598+
self.prepare_rooted_slot_iterator(slot, IteratorDirection::Reverse)
599+
}
600+
586601
fn get_recovery_data_shreds(
587602
index: &mut Index,
588603
set_index: u64,

local-cluster/src/local_cluster.rs

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,18 @@ use {
1717
gossip_service::discover_cluster,
1818
},
1919
solana_ledger::create_new_tmp_ledger,
20-
solana_runtime::genesis_utils::{
21-
create_genesis_config_with_vote_accounts_and_cluster_type, GenesisConfigInfo,
22-
ValidatorVoteKeypairs,
20+
solana_runtime::{
21+
genesis_utils::{
22+
create_genesis_config_with_vote_accounts_and_cluster_type, GenesisConfigInfo,
23+
ValidatorVoteKeypairs,
24+
},
25+
snapshot_config::SnapshotConfig,
2326
},
2427
solana_sdk::{
2528
account::Account,
2629
account::AccountSharedData,
2730
client::SyncClient,
28-
clock::{DEFAULT_DEV_SLOTS_PER_EPOCH, DEFAULT_TICKS_PER_SLOT},
31+
clock::{Slot, DEFAULT_DEV_SLOTS_PER_EPOCH, DEFAULT_TICKS_PER_SLOT},
2932
commitment_config::CommitmentConfig,
3033
epoch_schedule::EpochSchedule,
3134
genesis_config::{ClusterType, GenesisConfig},
@@ -50,10 +53,13 @@ use {
5053
collections::HashMap,
5154
io::{Error, ErrorKind, Result},
5255
iter,
56+
path::{Path, PathBuf},
5357
sync::{Arc, RwLock},
5458
},
5559
};
5660

61+
const DUMMY_SNAPSHOT_CONFIG_PATH_MARKER: &str = "dummy";
62+
5763
pub struct ClusterConfig {
5864
/// The validator config that should be applied to every node in the cluster
5965
pub validator_configs: Vec<ValidatorConfig>,
@@ -128,6 +134,23 @@ impl LocalCluster {
128134
Self::new(&mut config, socket_addr_space)
129135
}
130136

137+
fn sync_ledger_path_across_nested_config_fields(
138+
config: &mut ValidatorConfig,
139+
ledger_path: &Path,
140+
) {
141+
config.account_paths = vec![ledger_path.join("accounts")];
142+
config.tower_storage = Arc::new(FileTowerStorage::new(ledger_path.to_path_buf()));
143+
if let Some(snapshot_config) = &mut config.snapshot_config {
144+
let dummy: PathBuf = DUMMY_SNAPSHOT_CONFIG_PATH_MARKER.into();
145+
if snapshot_config.snapshot_archives_dir == dummy {
146+
snapshot_config.snapshot_archives_dir = ledger_path.to_path_buf();
147+
}
148+
if snapshot_config.bank_snapshots_dir == dummy {
149+
snapshot_config.bank_snapshots_dir = ledger_path.join("snapshot");
150+
}
151+
}
152+
}
153+
131154
pub fn new(config: &mut ClusterConfig, socket_addr_space: SocketAddrSpace) -> Self {
132155
assert_eq!(config.validator_configs.len(), config.node_stakes.len());
133156
let mut validator_keys = {
@@ -215,8 +238,7 @@ impl LocalCluster {
215238
let leader_contact_info = leader_node.info.clone();
216239
let mut leader_config = safe_clone_config(&config.validator_configs[0]);
217240
leader_config.rpc_addrs = Some((leader_node.info.rpc, leader_node.info.rpc_pubsub));
218-
leader_config.account_paths = vec![leader_ledger_path.join("accounts")];
219-
leader_config.tower_storage = Arc::new(FileTowerStorage::new(leader_ledger_path.clone()));
241+
Self::sync_ledger_path_across_nested_config_fields(&mut leader_config, &leader_ledger_path);
220242
let leader_keypair = Arc::new(Keypair::from_bytes(&leader_keypair.to_bytes()).unwrap());
221243
let leader_vote_keypair =
222244
Arc::new(Keypair::from_bytes(&leader_vote_keypair.to_bytes()).unwrap());
@@ -376,8 +398,7 @@ impl LocalCluster {
376398

377399
let mut config = safe_clone_config(validator_config);
378400
config.rpc_addrs = Some((validator_node.info.rpc, validator_node.info.rpc_pubsub));
379-
config.account_paths = vec![ledger_path.join("accounts")];
380-
config.tower_storage = Arc::new(FileTowerStorage::new(ledger_path.clone()));
401+
Self::sync_ledger_path_across_nested_config_fields(&mut config, &ledger_path);
381402
let voting_keypair = voting_keypair.unwrap();
382403
let validator_server = Validator::new(
383404
validator_node,
@@ -408,7 +429,7 @@ impl LocalCluster {
408429
validator_pubkey
409430
}
410431

411-
pub fn ledger_path(&self, validator_pubkey: &Pubkey) -> std::path::PathBuf {
432+
pub fn ledger_path(&self, validator_pubkey: &Pubkey) -> PathBuf {
412433
self.validators
413434
.get(validator_pubkey)
414435
.unwrap()
@@ -639,6 +660,19 @@ impl LocalCluster {
639660
)),
640661
}
641662
}
663+
664+
pub fn create_dummy_load_only_snapshot_config() -> SnapshotConfig {
665+
// DUMMY_SNAPSHOT_CONFIG_PATH_MARKER will be replaced with the real value as part of the cluster
666+
// node lifecycle.
667+
// There must be some placeholder for now...
668+
SnapshotConfig {
669+
full_snapshot_archive_interval_slots: Slot::MAX,
670+
incremental_snapshot_archive_interval_slots: Slot::MAX,
671+
snapshot_archives_dir: DUMMY_SNAPSHOT_CONFIG_PATH_MARKER.into(),
672+
bank_snapshots_dir: DUMMY_SNAPSHOT_CONFIG_PATH_MARKER.into(),
673+
..SnapshotConfig::default()
674+
}
675+
}
642676
}
643677

644678
impl Cluster for LocalCluster {
@@ -713,10 +747,10 @@ impl Cluster for LocalCluster {
713747
) -> ClusterValidatorInfo {
714748
// Restart the node
715749
let validator_info = &cluster_validator_info.info;
716-
cluster_validator_info.config.account_paths =
717-
vec![validator_info.ledger_path.join("accounts")];
718-
cluster_validator_info.config.tower_storage =
719-
Arc::new(FileTowerStorage::new(validator_info.ledger_path.clone()));
750+
LocalCluster::sync_ledger_path_across_nested_config_fields(
751+
&mut cluster_validator_info.config,
752+
&validator_info.ledger_path,
753+
);
720754
let restarted_node = Validator::new(
721755
node,
722756
validator_info.keypair.clone(),

local-cluster/src/validator_configs.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
5959
no_wait_for_vote_to_start_leader: config.no_wait_for_vote_to_start_leader,
6060
accounts_shrink_ratio: config.accounts_shrink_ratio,
6161
accounts_db_config: config.accounts_db_config.clone(),
62+
no_hard_fork_blockstore_root_reconcilation_for_local_cluster_test: config
63+
.no_hard_fork_blockstore_root_reconcilation_for_local_cluster_test,
6264
}
6365
}
6466

0 commit comments

Comments
 (0)