Commit dba3e8f

Merge pull request #2364 from TheBlueMatt/2023-06-htlc-preimage-replay
Re-claim forwarded HTLCs on startup
2 parents a358ba2 + 9ce7e8e commit dba3e8f

2 files changed: +112, -36 lines changed


lightning/src/ln/channel.rs (+4)

@@ -527,6 +527,10 @@ pub(super) struct ReestablishResponses {
 }
 
 /// The return type of `force_shutdown`
+///
+/// Contains a (counterparty_node_id, funding_txo, [`ChannelMonitorUpdate`]) tuple
+/// followed by a list of HTLCs to fail back in the form of the (source, payment hash, and this
+/// channel's counterparty_node_id and channel_id).
 pub(crate) type ShutdownResult = (
 Option<(PublicKey, OutPoint, ChannelMonitorUpdate)>,
 Vec<(HTLCSource, PaymentHash, PublicKey, [u8; 32])>
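
For orientation, a minimal sketch of how a caller might consume a tuple shaped like the documented `ShutdownResult` (the type aliases and the `handle_shutdown` helper below are simplified stand-ins for illustration, not the crate's actual items):

// Simplified stand-ins for the real lightning types; illustration only.
type PublicKey = [u8; 33];
type OutPoint = ([u8; 32], u16);
struct ChannelMonitorUpdate;
struct HTLCSource;
type PaymentHash = [u8; 32];

// Mirrors the documented shape: an optional (counterparty, funding_txo, update)
// plus a list of HTLCs to fail back as (source, payment_hash, counterparty, channel_id).
type ShutdownResult = (
    Option<(PublicKey, OutPoint, ChannelMonitorUpdate)>,
    Vec<(HTLCSource, PaymentHash, PublicKey, [u8; 32])>,
);

fn handle_shutdown(res: ShutdownResult) {
    let (monitor_update_opt, dropped_htlcs) = res;
    if let Some((counterparty_node_id, funding_txo, _update)) = monitor_update_opt {
        // Apply the ChannelMonitorUpdate for this (counterparty, funding_txo) pair.
        let _ = (counterparty_node_id, funding_txo);
    }
    for (_source, _payment_hash, _counterparty_node_id, _channel_id) in dropped_htlcs {
        // Fail each listed HTLC backwards on its inbound edge.
    }
}

fn main() {
    handle_shutdown((None, Vec::new()));
}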

lightning/src/ln/channelmanager.rs (+108, -36)

@@ -507,19 +507,19 @@ struct ClaimablePayments {
 /// running normally, and specifically must be processed before any other non-background
 /// [`ChannelMonitorUpdate`]s are applied.
 enum BackgroundEvent {
-/// Handle a ChannelMonitorUpdate which closes the channel. This is only separated from
-/// [`Self::MonitorUpdateRegeneratedOnStartup`] as the maybe-non-closing variant needs a public
-/// key to handle channel resumption, whereas if the channel has been force-closed we do not
-/// need the counterparty node_id.
+/// Handle a ChannelMonitorUpdate which closes the channel or for an already-closed channel.
+/// This is only separated from [`Self::MonitorUpdateRegeneratedOnStartup`] as the
+/// maybe-non-closing variant needs a public key to handle channel resumption, whereas if the
+/// channel has been force-closed we do not need the counterparty node_id.
 ///
 /// Note that any such events are lost on shutdown, so in general they must be updates which
 /// are regenerated on startup.
-ClosingMonitorUpdateRegeneratedOnStartup((OutPoint, ChannelMonitorUpdate)),
+ClosedMonitorUpdateRegeneratedOnStartup((OutPoint, ChannelMonitorUpdate)),
 /// Handle a ChannelMonitorUpdate which may or may not close the channel and may unblock the
 /// channel to continue normal operation.
 ///
 /// In general this should be used rather than
-/// [`Self::ClosingMonitorUpdateRegeneratedOnStartup`], however in cases where the
+/// [`Self::ClosedMonitorUpdateRegeneratedOnStartup`], however in cases where the
 /// `counterparty_node_id` is not available as the channel has closed from a [`ChannelMonitor`]
 /// error the other variant is acceptable.
 ///
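
As a rough sketch of the distinction the renamed variant draws (the types below are placeholders; only the two variant shapes mirror the diff), the already-closed variant carries no counterparty key, while the maybe-live variant keeps one so the channel can be resumed:

// Placeholder types for illustration; the real ones live in rust-lightning.
struct OutPoint;
struct ChannelMonitorUpdate;
struct PublicKey;

enum BackgroundEvent {
    // Channel is closed (or being closed): the update only needs to reach the
    // chain monitor, so no counterparty node_id is carried.
    ClosedMonitorUpdateRegeneratedOnStartup((OutPoint, ChannelMonitorUpdate)),
    // Channel may continue operating: keep the counterparty key so normal
    // per-peer handling can resume the channel once the update completes.
    MonitorUpdateRegeneratedOnStartup {
        counterparty_node_id: PublicKey,
        funding_txo: OutPoint,
        update: ChannelMonitorUpdate,
    },
}

fn process_background_event(event: BackgroundEvent) {
    match event {
        BackgroundEvent::ClosedMonitorUpdateRegeneratedOnStartup((funding_txo, update)) => {
            // Already closed: just hand the update to the chain monitor.
            let _ = (funding_txo, update);
        }
        BackgroundEvent::MonitorUpdateRegeneratedOnStartup { counterparty_node_id, funding_txo, update } => {
            // Maybe still live: route through the peer state keyed by counterparty_node_id.
            let _ = (counterparty_node_id, funding_txo, update);
        }
    }
}

fn main() {
    process_background_event(
        BackgroundEvent::ClosedMonitorUpdateRegeneratedOnStartup((OutPoint, ChannelMonitorUpdate)));
}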
@@ -1114,7 +1114,6 @@ where
 /// Notifier the lock contains sends out a notification when the lock is released.
 total_consistency_lock: RwLock<()>,
 
-#[cfg(debug_assertions)]
 background_events_processed_since_startup: AtomicBool,
 
 persistence_notifier: Notifier,
@@ -1917,9 +1916,7 @@ macro_rules! handle_new_monitor_update {
 // update_maps_on_chan_removal needs to be able to take id_to_peer, so make sure we can in
 // any case so that it won't deadlock.
 debug_assert_ne!($self.id_to_peer.held_by_thread(), LockHeldState::HeldByThread);
-#[cfg(debug_assertions)] {
-debug_assert!($self.background_events_processed_since_startup.load(Ordering::Acquire));
-}
+debug_assert!($self.background_events_processed_since_startup.load(Ordering::Acquire));
 match $update_res {
 ChannelMonitorUpdateStatus::InProgress => {
 log_debug!($self.logger, "ChannelMonitor update for {} in flight, holding messages until the update completes.",
@@ -2111,7 +2108,6 @@ where
 pending_events_processor: AtomicBool::new(false),
 pending_background_events: Mutex::new(Vec::new()),
 total_consistency_lock: RwLock::new(()),
-#[cfg(debug_assertions)]
 background_events_processed_since_startup: AtomicBool::new(false),
 persistence_notifier: Notifier::new(),
 
@@ -4150,7 +4146,6 @@ where
 fn process_background_events(&self) -> NotifyOption {
 debug_assert_ne!(self.total_consistency_lock.held_by_thread(), LockHeldState::NotHeldByThread);
 
-#[cfg(debug_assertions)]
 self.background_events_processed_since_startup.store(true, Ordering::Release);
 
 let mut background_events = Vec::new();
@@ -4161,7 +4156,7 @@ where
 
 for event in background_events.drain(..) {
 match event {
-BackgroundEvent::ClosingMonitorUpdateRegeneratedOnStartup((funding_txo, update)) => {
+BackgroundEvent::ClosedMonitorUpdateRegeneratedOnStartup((funding_txo, update)) => {
 // The channel has already been closed, so no use bothering to care about the
 // monitor updating completing.
 let _ = self.chain_monitor.update_channel(funding_txo, &update);
@@ -4741,6 +4736,11 @@ where
 -> Result<(), (PublicKey, MsgHandleErrInternal)> {
 //TODO: Delay the claimed_funds relaying just like we do outbound relay!
 
+// If we haven't yet run background events assume we're still deserializing and shouldn't
+// actually pass `ChannelMonitorUpdate`s to users yet. Instead, queue them up as
+// `BackgroundEvent`s.
+let during_init = !self.background_events_processed_since_startup.load(Ordering::Acquire);
+
 {
 let per_peer_state = self.per_peer_state.read().unwrap();
 let chan_id = prev_hop.outpoint.to_channel_id();
@@ -4767,14 +4767,26 @@ where
 log_bytes!(chan_id), action);
 peer_state.monitor_update_blocked_actions.entry(chan_id).or_insert(Vec::new()).push(action);
 }
-let res = handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
-peer_state, per_peer_state, chan);
-if let Err(e) = res {
-// TODO: This is a *critical* error - we probably updated the outbound edge
-// of the HTLC's monitor with a preimage. We should retry this monitor
-// update over and over again until morale improves.
-log_error!(self.logger, "Failed to update channel monitor with preimage {:?}", payment_preimage);
-return Err((counterparty_node_id, e));
+if !during_init {
+let res = handle_new_monitor_update!(self, prev_hop.outpoint, monitor_update, peer_state_lock,
+peer_state, per_peer_state, chan);
+if let Err(e) = res {
+// TODO: This is a *critical* error - we probably updated the outbound edge
+// of the HTLC's monitor with a preimage. We should retry this monitor
+// update over and over again until morale improves.
+log_error!(self.logger, "Failed to update channel monitor with preimage {:?}", payment_preimage);
+return Err((counterparty_node_id, e));
+}
+} else {
+// If we're running during init we cannot update a monitor directly -
+// they probably haven't actually been loaded yet. Instead, push the
+// monitor update as a background event.
+self.pending_background_events.lock().unwrap().push(
+BackgroundEvent::MonitorUpdateRegeneratedOnStartup {
+counterparty_node_id,
+funding_txo: prev_hop.outpoint,
+update: monitor_update.clone(),
+});
 }
 }
 return Ok(());
@@ -4787,16 +4799,34 @@ where
 payment_preimage,
 }],
 };
-// We update the ChannelMonitor on the backward link, after
-// receiving an `update_fulfill_htlc` from the forward link.
-let update_res = self.chain_monitor.update_channel(prev_hop.outpoint, &preimage_update);
-if update_res != ChannelMonitorUpdateStatus::Completed {
-// TODO: This needs to be handled somehow - if we receive a monitor update
-// with a preimage we *must* somehow manage to propagate it to the upstream
-// channel, or we must have an ability to receive the same event and try
-// again on restart.
-log_error!(self.logger, "Critical error: failed to update channel monitor with preimage {:?}: {:?}",
-payment_preimage, update_res);
+
+if !during_init {
+// We update the ChannelMonitor on the backward link, after
+// receiving an `update_fulfill_htlc` from the forward link.
+let update_res = self.chain_monitor.update_channel(prev_hop.outpoint, &preimage_update);
+if update_res != ChannelMonitorUpdateStatus::Completed {
+// TODO: This needs to be handled somehow - if we receive a monitor update
+// with a preimage we *must* somehow manage to propagate it to the upstream
+// channel, or we must have an ability to receive the same event and try
+// again on restart.
+log_error!(self.logger, "Critical error: failed to update channel monitor with preimage {:?}: {:?}",
+payment_preimage, update_res);
+}
+} else {
+// If we're running during init we cannot update a monitor directly - they probably
+// haven't actually been loaded yet. Instead, push the monitor update as a background
+// event.
+// Note that while it's safe to use `ClosedMonitorUpdateRegeneratedOnStartup` here (the
+// channel is already closed) we need to ultimately handle the monitor update
+// completion action only after we've completed the monitor update. This is the only
+// way to guarantee this update *will* be regenerated on startup (otherwise if this was
+// from a forwarded HTLC the downstream preimage may be deleted before we claim
+// upstream). Thus, we need to transition to some new `BackgroundEvent` type which will
+// complete the monitor update completion action from `completion_action`.
+self.pending_background_events.lock().unwrap().push(
+BackgroundEvent::ClosedMonitorUpdateRegeneratedOnStartup((
+prev_hop.outpoint, preimage_update,
+)));
 }
 // Note that we do process the completion action here. This totally could be a
 // duplicate claim, but we have no way of knowing without interrogating the
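
A minimal, self-contained sketch of the `during_init` gating used in both branches above; everything except the flag check and the queue-vs-apply split is a simplified placeholder:

use std::sync::Mutex;
use std::sync::atomic::{AtomicBool, Ordering};

// Placeholder update/event types for illustration only.
struct ChannelMonitorUpdate;
enum BackgroundEvent { ClosedMonitorUpdateRegeneratedOnStartup(ChannelMonitorUpdate) }

struct Manager {
    // Flipped to true the first time process_background_events() runs.
    background_events_processed_since_startup: AtomicBool,
    pending_background_events: Mutex<Vec<BackgroundEvent>>,
}

impl Manager {
    fn apply_or_queue_update(&self, update: ChannelMonitorUpdate) {
        // Until background events have run, monitors may not be loaded yet, so
        // the update cannot be handed out directly.
        let during_init =
            !self.background_events_processed_since_startup.load(Ordering::Acquire);
        if !during_init {
            // Normal operation: apply the monitor update right away.
            drop(update);
        } else {
            // Startup: queue the update so it is replayed once monitors are
            // available (and regenerated on any restart before then).
            self.pending_background_events.lock().unwrap().push(
                BackgroundEvent::ClosedMonitorUpdateRegeneratedOnStartup(update));
        }
    }
}

fn main() {
    let mgr = Manager {
        background_events_processed_since_startup: AtomicBool::new(false),
        pending_background_events: Mutex::new(Vec::new()),
    };
    mgr.apply_or_queue_update(ChannelMonitorUpdate);
}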
@@ -4814,6 +4844,8 @@ where
 fn claim_funds_internal(&self, source: HTLCSource, payment_preimage: PaymentPreimage, forwarded_htlc_value_msat: Option<u64>, from_onchain: bool, next_channel_id: [u8; 32]) {
 match source {
 HTLCSource::OutboundRoute { session_priv, payment_id, path, .. } => {
+debug_assert!(self.background_events_processed_since_startup.load(Ordering::Acquire),
+"We don't support claim_htlc claims during startup - monitors may not be available yet");
 self.pending_outbound_payments.claim_htlc(payment_id, payment_preimage, session_priv, path, from_onchain, &self.pending_events, &self.logger);
 },
 HTLCSource::PreviousHopData(hop_data) => {
@@ -5748,7 +5780,7 @@ where
 }
 
 /// Checks whether [`ChannelMonitorUpdate`]s generated by the receipt of a remote
-/// [`msgs::RevokeAndACK`] should be held for the given channel until some other event
+/// [`msgs::RevokeAndACK`] should be held for the given channel until some other action
 /// completes. Note that this needs to happen in the same [`PeerState`] mutex as any release of
 /// the [`ChannelMonitorUpdate`] in question.
 fn raa_monitor_updates_held(&self,
@@ -6365,7 +6397,7 @@ where
 /// When something which was blocking a channel from updating its [`ChannelMonitor`] (e.g. an
 /// [`Event`] being handled) completes, this should be called to restore the channel to normal
 /// operation. It will double-check that nothing *else* is also blocking the same channel from
-/// making progress and then any blocked [`ChannelMonitorUpdate`]s fly.
+/// making progress and then let any blocked [`ChannelMonitorUpdate`]s fly.
 fn handle_monitor_update_release(&self, counterparty_node_id: PublicKey, channel_funding_outpoint: OutPoint, mut completed_blocker: Option<RAAMonitorUpdateBlockingAction>) {
 let mut errors = Vec::new();
 loop {
@@ -8294,7 +8326,7 @@ where
 update_id: CLOSED_CHANNEL_UPDATE_ID,
 updates: vec![ChannelMonitorUpdateStep::ChannelForceClosed { should_broadcast: true }],
 };
-close_background_events.push(BackgroundEvent::ClosingMonitorUpdateRegeneratedOnStartup((*funding_txo, monitor_update)));
+close_background_events.push(BackgroundEvent::ClosedMonitorUpdateRegeneratedOnStartup((*funding_txo, monitor_update)));
 }
 }
 
@@ -8549,6 +8581,11 @@ where
 // Note that we have to do the above replays before we push new monitor updates.
 pending_background_events.append(&mut close_background_events);
 
+// If there's any preimages for forwarded HTLCs hanging around in ChannelMonitors we
+// should ensure we try them again on the inbound edge. We put them here and do so after we
+// have a fully-constructed `ChannelManager` at the end.
+let mut pending_claims_to_replay = Vec::new();
+
 {
 // If we're tracking pending payments, ensure we haven't lost any by looking at the
 // ChannelMonitor data for any channels for which we do not have authorative state
@@ -8559,7 +8596,8 @@ where
 // We only rebuild the pending payments map if we were most recently serialized by
 // 0.0.102+
 for (_, monitor) in args.channel_monitors.iter() {
-if id_to_peer.get(&monitor.get_funding_txo().0.to_channel_id()).is_none() {
+let counterparty_opt = id_to_peer.get(&monitor.get_funding_txo().0.to_channel_id());
+if counterparty_opt.is_none() {
 for (htlc_source, (htlc, _)) in monitor.get_pending_or_resolved_outbound_htlcs() {
 if let HTLCSource::OutboundRoute { payment_id, session_priv, path, .. } = htlc_source {
 if path.hops.is_empty() {
@@ -8653,6 +8691,33 @@ where
 }
 }
 }
+
+// Whether the downstream channel was closed or not, try to re-apply any payment
+// preimages from it which may be needed in upstream channels for forwarded
+// payments.
+let outbound_claimed_htlcs_iter = monitor.get_all_current_outbound_htlcs()
+.into_iter()
+.filter_map(|(htlc_source, (htlc, preimage_opt))| {
+if let HTLCSource::PreviousHopData(_) = htlc_source {
+if let Some(payment_preimage) = preimage_opt {
+Some((htlc_source, payment_preimage, htlc.amount_msat,
+// Check if `counterparty_opt.is_none()` to see if the
+// downstream chan is closed (because we don't have a
+// channel_id -> peer map entry).
+counterparty_opt.is_none(),
+monitor.get_funding_txo().0.to_channel_id()))
+} else { None }
+} else {
+// If it was an outbound payment, we've handled it above - if a preimage
+// came in and we persisted the `ChannelManager` we either handled it and
+// are good to go or the channel force-closed - we don't have to handle the
+// channel still live case here.
+None
+}
+});
+for tuple in outbound_claimed_htlcs_iter {
+pending_claims_to_replay.push(tuple);
+}
 }
 }
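
A simplified, standalone sketch of the same filtering idea (the HTLC and source types here are hypothetical stand-ins; only the "keep forwarded HTLCs with a known preimage" logic mirrors the hunk above):

// Hypothetical, simplified stand-ins for the monitor's HTLC data.
#[derive(Clone, Copy)]
enum HtlcSource { PreviousHopData, OutboundRoute }

struct Htlc { amount_msat: u64 }

// Collect (source, preimage, amount, downstream_closed, channel_id) tuples for
// forwarded HTLCs whose preimage is already known, so they can be re-claimed
// upstream once the ChannelManager is fully constructed.
fn preimages_to_replay(
    htlcs: Vec<(HtlcSource, (Htlc, Option<[u8; 32]>))>,
    downstream_closed: bool,
    channel_id: [u8; 32],
) -> Vec<(HtlcSource, [u8; 32], u64, bool, [u8; 32])> {
    htlcs.into_iter().filter_map(|(source, (htlc, preimage_opt))| {
        match (source, preimage_opt) {
            // Forwarded HTLC with a known preimage: replay the claim upstream.
            (HtlcSource::PreviousHopData, Some(preimage)) =>
                Some((source, preimage, htlc.amount_msat, downstream_closed, channel_id)),
            // Outbound payments are handled elsewhere; no preimage means nothing to replay.
            _ => None,
        }
    }).collect()
}

fn main() {
    let htlcs = vec![
        (HtlcSource::PreviousHopData, (Htlc { amount_msat: 1_000 }, Some([1u8; 32]))),
        (HtlcSource::OutboundRoute, (Htlc { amount_msat: 2_000 }, Some([2u8; 32]))),
    ];
    let replays = preimages_to_replay(htlcs, true, [0u8; 32]);
    assert_eq!(replays.len(), 1);
}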

@@ -8885,7 +8950,6 @@ where
 pending_events_processor: AtomicBool::new(false),
 pending_background_events: Mutex::new(pending_background_events),
 total_consistency_lock: RwLock::new(()),
-#[cfg(debug_assertions)]
 background_events_processed_since_startup: AtomicBool::new(false),
 persistence_notifier: Notifier::new(),
 
@@ -8904,6 +8968,14 @@ where
 channel_manager.fail_htlc_backwards_internal(&source, &payment_hash, &reason, receiver);
 }
 
+for (source, preimage, downstream_value, downstream_closed, downstream_chan_id) in pending_claims_to_replay {
+// We use `downstream_closed` in place of `from_onchain` here just as a guess - we
+// don't remember in the `ChannelMonitor` where we got a preimage from, but if the
+// channel is closed we just assume that it probably came from an on-chain claim.
+channel_manager.claim_funds_internal(source, preimage, Some(downstream_value),
+downstream_closed, downstream_chan_id);
+}
+
 //TODO: Broadcast channel update for closed channels, but only after we've made a
 //connection or two.