Skip to content

Commit 10ae04e

Browse files
committed
Trust Quorum: Handle prepare messages + Alarms
This builds on #8052. Node's now handle `PrepareMsg`s from coordinators. The coordinator proptest was updated to generate prepares from non-existent test only nodes and send them to the coordinator. Additionally, protocol invariant violations are now detected in a few cases and recorded to the `PersistentState`. This is for debugging and support purposes. The goal is to test the code well enough that we never actually see an alarm in production.
1 parent cbb7db4 commit 10ae04e

File tree

7 files changed

+362
-14
lines changed

7 files changed

+362
-14
lines changed

trust-quorum/src/alarm.rs

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! A mechanism for reporting protocol invariant violations
6+
//!
7+
//! Invariant violations should _never_ occur. They represent a critical bug in
8+
//! the implementation of the system. In certain scenarios we can detect these
9+
//! invariant violations and record them. This allows reporting them to higher
10+
//! levels of control plane software so that we can debug them and fix them in
11+
//! future releases, as well as rectify outstanding issues on systems where such
12+
//! an alarm arose.
13+
14+
use crate::{Epoch, PlatformId};
15+
use serde::{Deserialize, Serialize};
16+
17+
/// A critical invariant violation that should never occur.
18+
///
19+
/// Many invariant violations are only possible on receipt of peer messages,
20+
/// and are _not_ a result of API calls. This means that there isn't a good
21+
/// way to directly inform the rest of the control plane. Instead we provide a
22+
/// queryable API for `crate::Node` status that includes alerts.
23+
///
24+
/// If an `Alarm` is ever seen by an operator then support should be contacted
25+
/// immediately.
26+
#[derive(
27+
Debug, Clone, thiserror::Error, PartialEq, Eq, Serialize, Deserialize,
28+
)]
29+
pub enum Alarm {
30+
#[error(
31+
"prepare for a later configuration exists: \
32+
last_prepared_epoch = {last_prepared_epoch:?}, \
33+
commit_epoch = {commit_epoch}"
34+
)]
35+
OutOfOrderCommit { last_prepared_epoch: Option<Epoch>, commit_epoch: Epoch },
36+
37+
#[error("commit attempted, but missing prepare message: epoch = {epoch}")]
38+
MissingPrepare { epoch: Epoch },
39+
40+
#[error(
41+
"prepare received with mismatched last_committed_epoch: prepare's last \
42+
committed epoch = {prepare_last_committed_epoch:?}, persisted \
43+
prepare's last_committed_epoch = \
44+
{persisted_prepare_last_committed_epoch:?}"
45+
)]
46+
PrepareLastCommittedEpochMismatch {
47+
prepare_last_committed_epoch: Option<Epoch>,
48+
persisted_prepare_last_committed_epoch: Option<Epoch>,
49+
},
50+
51+
#[error(
52+
"different nodes coordinating same epoch = {epoch}: \
53+
them = {them}, us = {us}"
54+
)]
55+
DifferentNodesCoordinatingSameEpoch {
56+
epoch: Epoch,
57+
them: PlatformId,
58+
us: PlatformId,
59+
},
60+
}

trust-quorum/src/crypto.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ use zeroize::{Zeroize, ZeroizeOnDrop, Zeroizing};
2929
const LRTQ_SHARE_SIZE: usize = 33;
3030

3131
/// We don't distinguish whether this is an Ed25519 Scalar or set of GF(256)
32-
/// polynomials points with an x-coordinate of 0. Both can be treated as 32 byte
33-
/// blobs when decrypted, as they are immediately fed into HKDF.
32+
/// polynomials' points with an x-coordinate of 0. Both can be treated as 32
33+
/// byte blobs when decrypted, as they are immediately fed into HKDF.
3434
#[derive(
3535
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize,
3636
)]

trust-quorum/src/lib.rs

+4-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
use derive_more::Display;
1313
use serde::{Deserialize, Serialize};
1414

15+
mod alarm;
1516
mod configuration;
1617
mod coordinator_state;
1718
pub(crate) mod crypto;
@@ -20,9 +21,10 @@ mod messages;
2021
mod node;
2122
mod persistent_state;
2223
mod validators;
23-
pub use configuration::Configuration;
24+
pub use alarm::Alarm;
25+
pub use configuration::{Configuration, PreviousConfiguration};
2426
pub(crate) use coordinator_state::CoordinatorState;
25-
pub use crypto::RackSecret;
27+
pub use crypto::{EncryptedRackSecret, RackSecret, Salt, Sha3_256Digest};
2628
pub use messages::*;
2729
pub use node::Node;
2830
pub use persistent_state::{PersistentState, PersistentStateSummary};

trust-quorum/src/node.rs

+152-6
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
use crate::errors::{CommitError, MismatchedRackIdError, ReconfigurationError};
88
use crate::validators::ValidatedReconfigureMsg;
99
use crate::{
10-
CoordinatorState, Envelope, Epoch, PersistentState, PlatformId, messages::*,
10+
Alarm, CoordinatorState, Envelope, Epoch, PersistentState, PlatformId,
11+
messages::*,
1112
};
1213
use gfss::shamir::Share;
1314
use omicron_uuid_kinds::RackUuid;
@@ -110,10 +111,14 @@ impl Node {
110111
if last_prepared_epoch != Some(epoch) {
111112
error!(
112113
self.log,
113-
"Commit message occurred out of order";
114+
"Protocol invariant violation: commit message out of order";
114115
"epoch" => %epoch,
115116
"last_prepared_epoch" => ?last_prepared_epoch
116117
);
118+
self.persistent_state.add_alarm(Alarm::OutOfOrderCommit {
119+
last_prepared_epoch,
120+
commit_epoch: epoch,
121+
});
117122
return Err(CommitError::OutOfOrderCommit);
118123
}
119124
if let Some(prepare) = self.persistent_state.prepares.get(&epoch) {
@@ -136,11 +141,19 @@ impl Node {
136141
//
137142
// Nexus should instead tell this node to retrieve a `Prepare`
138143
// from another node that has already committed.
139-
error!(
140-
self.log,
141-
"tried to commit a configuration, but missing prepare msg";
142-
"epoch" => %epoch
144+
//
145+
// This is a protocol bug because nexus must have seen an
146+
// acknowledgement that this node had a `PrepareMsg` or erroneously
147+
// sent a `Commit` anyway. This is a less serious error than other
148+
// invariant violations since it can be recovered from. However, it
149+
// is still worthy of an alarm, as the most likely case is a disk/
150+
// ledger failure.
151+
let err_msg = concat!(
152+
"Protocol invariant violation: triedi to ",
153+
"commit a configuration, but missing prepare msg"
143154
);
155+
error!(self.log, "{err_msg}"; "epoch" => %epoch);
156+
self.persistent_state.add_alarm(Alarm::MissingPrepare { epoch });
144157
return Err(CommitError::MissingPrepare);
145158
}
146159

@@ -178,6 +191,7 @@ impl Node {
178191
msg: PeerMsg,
179192
) -> Option<PersistentState> {
180193
match msg {
194+
PeerMsg::Prepare(msg) => self.handle_prepare(outbox, from, msg),
181195
PeerMsg::PrepareAck(epoch) => {
182196
self.handle_prepare_ack(from, epoch);
183197
None
@@ -196,6 +210,138 @@ impl Node {
196210
self.coordinator_state.as_ref()
197211
}
198212

213+
fn handle_prepare(
214+
&mut self,
215+
outbox: &mut Vec<Envelope>,
216+
from: PlatformId,
217+
msg: PrepareMsg,
218+
) -> Option<PersistentState> {
219+
// TODO: check rack_id
220+
if let Some(last_prepared_epoch) =
221+
self.persistent_state.last_prepared_epoch()
222+
{
223+
// Is this an old request?
224+
if msg.config.epoch < last_prepared_epoch {
225+
warn!(self.log, "Received stale prepare";
226+
"from" => %from,
227+
"msg_epoch" => %msg.config.epoch,
228+
"last_prepared_epoch" => %last_prepared_epoch
229+
);
230+
// TODO: Respond to sender?
231+
return None;
232+
}
233+
234+
// Idempotent request
235+
if msg.config.epoch == last_prepared_epoch {
236+
return None;
237+
}
238+
239+
// Does the last committed epoch match what we have?
240+
let msg_last_committed_epoch =
241+
msg.config.previous_configuration.as_ref().map(|p| p.epoch);
242+
let last_committed_epoch =
243+
self.persistent_state.last_committed_epoch();
244+
if msg_last_committed_epoch != last_committed_epoch {
245+
// If the msg contains an older last_committed_epoch than what
246+
// we have, then out of order commits have occurred, as we know
247+
// this prepare is later than what we've seen. This is a critical
248+
// protocol invariant that has been violated.
249+
//
250+
// If the msg contains a newer last_committed_epoch than what
251+
// we have, then we have likely missed a commit and are behind
252+
// by more than one reconfiguration. The protocol currently does
253+
// not allow this. Future protocol implementations may provide a
254+
// capability to "jump" configurations.
255+
//
256+
// If there is a `None`/`Some` mismatch, then again, an invariant
257+
// has been violated somewhere. The coordinator should know
258+
// whether this is a "new" node or not, which would have a "None"
259+
// configuration.
260+
let err_msg = concat!(
261+
"Protocol invariant violation: ",
262+
"PrepareMsg last_committed_epoch does not match ours"
263+
);
264+
error!(self.log, "{err_msg}";
265+
"msg_last_committed_epoch" => ?msg_last_committed_epoch,
266+
"last_committed_epoch" => ?last_committed_epoch
267+
);
268+
self.persistent_state.add_alarm(
269+
Alarm::PrepareLastCommittedEpochMismatch {
270+
prepare_last_committed_epoch: msg_last_committed_epoch,
271+
persisted_prepare_last_committed_epoch:
272+
last_committed_epoch,
273+
},
274+
);
275+
return Some(self.persistent_state.clone());
276+
}
277+
278+
// The prepare is up-to-date with respect to our persistent state
279+
};
280+
281+
// Are we currently trying to coordinate?
282+
if let Some(cs) = &self.coordinator_state {
283+
let coordinating_epoch = cs.reconfigure_msg().epoch();
284+
if coordinating_epoch > msg.config.epoch {
285+
warn!(self.log, "Received stale prepare while coordinating";
286+
"from" => %from,
287+
"msg_epoch" => %msg.config.epoch,
288+
"epoch" => %cs.reconfigure_msg().epoch()
289+
);
290+
return None;
291+
}
292+
if coordinating_epoch == msg.config.epoch {
293+
// TODO: Track these in an "invariant violation alarm" struct
294+
// that can be queried.
295+
let err_msg = concat!(
296+
"Protocol invariant violation: Nexus has told different",
297+
"nodes to coordinate the same reconfiguration."
298+
);
299+
error!(self.log, "{err_msg}";
300+
"epoch" => %coordinating_epoch,
301+
"from" => %from,
302+
"id" => %self.platform_id
303+
304+
);
305+
self.persistent_state.add_alarm(
306+
Alarm::DifferentNodesCoordinatingSameEpoch {
307+
epoch: coordinating_epoch,
308+
them: from,
309+
us: self.platform_id.clone(),
310+
},
311+
);
312+
return Some(self.persistent_state.clone());
313+
}
314+
315+
// This prepare is for a newer configuration than the one we
316+
// are currently coordinating. We must implicitly cancel our
317+
// coordination as Nexus has moved on.
318+
let cancel_msg = concat!(
319+
"Received a prepare for newer configuration.",
320+
"Cancelling our coordination."
321+
);
322+
info!(self.log, "{cancel_msg}";
323+
"msg_epoch" => %msg.config.epoch,
324+
"epoch" => %coordinating_epoch,
325+
"from" => %from
326+
);
327+
}
328+
329+
// Acknowledge the PrepareMsg
330+
outbox.push(Envelope {
331+
to: from,
332+
from: self.platform_id.clone(),
333+
msg: PeerMsg::PrepareAck(msg.config.epoch),
334+
});
335+
336+
// Add the prepare to our `PersistentState`
337+
self.persistent_state.prepares.insert(msg.config.epoch, msg);
338+
339+
// Stop coordinating
340+
self.coordinator_state = None;
341+
342+
Some(self.persistent_state.clone())
343+
}
344+
199345
// Handle receipt of a `PrepareAck` message
200346
fn handle_prepare_ack(&mut self, from: PlatformId, epoch: Epoch) {
201347
// Are we coordinating for this epoch?

trust-quorum/src/persistent_state.rs

+25-1
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,19 @@
88
99
use crate::crypto::LrtqShare;
1010
use crate::messages::PrepareMsg;
11-
use crate::{Configuration, Epoch, PlatformId};
11+
use crate::{Alarm, Configuration, Epoch, PlatformId};
1212
use bootstore::schemes::v0::SharePkgCommon as LrtqShareData;
1313
use gfss::shamir::Share;
1414
use omicron_uuid_kinds::{GenericUuid, RackUuid};
1515
use serde::{Deserialize, Serialize};
1616
use std::collections::{BTreeMap, BTreeSet};
1717

18+
// We want to prevent the situation where invariant violations keep replaying
19+
// due to message receipt and we blow up memory / storage with them. In
20+
// practice, even one `Alarm` is a critical bug, but we leave room to track a
21+
// few more.
22+
const MAX_ALARMS: usize = 10;
23+
1824
/// All the persistent state for this protocol
1925
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
2026
pub struct PersistentState {
@@ -30,6 +36,14 @@ pub struct PersistentState {
3036
// node. The sled corresponding to the node must be factory reset by wiping
3137
// its storage.
3238
pub decommissioned: Option<DecommissionedMetadata>,
39+
40+
// We persist [`Alarm`]s so that they can't be lost
41+
//
42+
// During a support scenario, any existing alarms must be manually cleared
43+
// as part of resolving the issue. In principal, there should be *no* alarms
44+
// as they are only for protocol invariant violations, so this should never
45+
// be necessary.
46+
alarms: Vec<Alarm>,
3347
}
3448

3549
impl PersistentState {
@@ -39,6 +53,7 @@ impl PersistentState {
3953
prepares: BTreeMap::new(),
4054
commits: BTreeSet::new(),
4155
decommissioned: None,
56+
alarms: Vec::new(),
4257
}
4358
}
4459

@@ -93,6 +108,13 @@ impl PersistentState {
93108
self.prepares.get(&epoch).expect("missing prepare").share.clone()
94109
})
95110
}
111+
112+
// Add an alarm, if we haven't already reached the max number of alarms.
113+
pub fn add_alarm(&mut self, alarm: Alarm) {
114+
if self.alarms.len() < MAX_ALARMS {
115+
self.alarms.push(alarm)
116+
}
117+
}
96118
}
97119

98120
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -115,6 +137,7 @@ pub struct PersistentStateSummary {
115137
pub last_prepared_epoch: Option<Epoch>,
116138
pub last_committed_epoch: Option<Epoch>,
117139
pub decommissioned: Option<DecommissionedMetadata>,
140+
pub alarms: Vec<Alarm>,
118141
}
119142

120143
impl From<&PersistentState> for PersistentStateSummary {
@@ -126,6 +149,7 @@ impl From<&PersistentState> for PersistentStateSummary {
126149
last_prepared_epoch: value.last_prepared_epoch(),
127150
last_committed_epoch: value.last_committed_epoch(),
128151
decommissioned: value.decommissioned.clone(),
152+
alarms: value.alarms.clone(),
129153
}
130154
}
131155
}

0 commit comments

Comments
 (0)