Skip to content

hsmd: Add hsmd_forget_channel to tell hsmd to delete a channel #101

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: 2023-11-remote-hsmd-v23.11
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions contrib/pyln-testing/pyln/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,7 @@ def getnewaddress(self):


class ValidatingLightningSignerD(TailableProc):
def __init__(self, vlsd_dir, vlsd_port, node_id, network):
def __init__(self, vlsd_dir, vlsd_port, vlsd_rpc_port, node_id, network):
TailableProc.__init__(self, vlsd_dir, verbose=True)
self.executable = env("REMOTE_SIGNER_CMD", 'vlsd2')
os.environ['ALLOWLIST'] = env(
Expand All @@ -629,6 +629,7 @@ def __init__(self, vlsd_dir, vlsd_port, node_id, network):
'--network={}'.format(network),
'--datadir={}'.format(vlsd_dir),
'--connect=http://localhost:{}'.format(vlsd_port),
'--rpc-server-port={}'.format(vlsd_rpc_port),
'--integration-test',
]
self.prefix = 'vlsd2-%d' % (node_id)
Expand Down Expand Up @@ -676,6 +677,7 @@ def __init__(
self.use_vlsd = False
self.vlsd_dir = os.path.join(lightning_dir, "vlsd")
self.vlsd_port = None
self.vlsd_rpc_server_port = None
self.vlsd = None
self.node_id = node_id

Expand Down Expand Up @@ -794,6 +796,7 @@ def start(self, stdin=None, wait_for_initialized=True, stderr_redir=False):

if self.use_vlsd:
self.vlsd_port = reserve_unused_port()
self.vlsd_rpc_server_port = reserve_unused_port()
# We can't do this in the constructor because we need a new port on each restart.
self.env['VLS_PORT'] = str(self.vlsd_port)
# Kill any previous vlsd (we may have been restarted)
Expand All @@ -806,7 +809,7 @@ def start(self, stdin=None, wait_for_initialized=True, stderr_redir=False):
if self.use_vlsd:
# Start the remote signer first
self.vlsd = ValidatingLightningSignerD(
self.vlsd_dir, self.vlsd_port, self.node_id, self.opts['network'])
self.vlsd_dir, self.vlsd_port, self.vlsd_rpc_server_port, self.node_id, self.opts['network'])
self.vlsd.start(
stdin, stdout_redir=True, stderr_redir=True,
wait_for_initialized=wait_for_initialized)
Expand Down
2 changes: 2 additions & 0 deletions hsmd/hsmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,7 @@ static struct io_plan *handle_client(struct io_conn *conn, struct client *c)
case WIRE_HSMD_SETUP_CHANNEL:
case WIRE_HSMD_CHECK_OUTPOINT:
case WIRE_HSMD_LOCK_OUTPOINT:
case WIRE_HSMD_FORGET_CHANNEL:
case WIRE_HSMD_SIGN_COMMITMENT_TX:
case WIRE_HSMD_VALIDATE_COMMITMENT_TX:
case WIRE_HSMD_VALIDATE_REVOCATION:
Expand Down Expand Up @@ -694,6 +695,7 @@ static struct io_plan *handle_client(struct io_conn *conn, struct client *c)
case WIRE_HSMD_SETUP_CHANNEL_REPLY:
case WIRE_HSMD_CHECK_OUTPOINT_REPLY:
case WIRE_HSMD_LOCK_OUTPOINT_REPLY:
case WIRE_HSMD_FORGET_CHANNEL_REPLY:
case WIRE_HSMD_NODE_ANNOUNCEMENT_SIG_REPLY:
case WIRE_HSMD_SIGN_WITHDRAWAL_REPLY:
case WIRE_HSMD_SIGN_INVOICE_REPLY:
Expand Down
8 changes: 8 additions & 0 deletions hsmd/hsmd_wire.csv
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ msgdata,hsmd_lock_outpoint,funding_txout,u16,
# No value returned.
msgtype,hsmd_lock_outpoint_reply,137

# Forget channel.
msgtype,hsmd_forget_channel,34
msgdata,hsmd_forget_channel,id,node_id,
msgdata,hsmd_forget_channel,dbid,u64,

# No value returned.
msgtype,hsmd_forget_channel_reply,134

# Return signature for a funding tx.
#include <common/utxo.h>

Expand Down
21 changes: 21 additions & 0 deletions hsmd/libhsmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ bool hsmd_check_client_capabilities(struct hsmd_client *client,

case WIRE_HSMD_INIT:
case WIRE_HSMD_NEW_CHANNEL:
case WIRE_HSMD_FORGET_CHANNEL:
case WIRE_HSMD_CLIENT_HSMFD:
case WIRE_HSMD_SIGN_WITHDRAWAL:
case WIRE_HSMD_SIGN_INVOICE:
Expand Down Expand Up @@ -150,6 +151,7 @@ bool hsmd_check_client_capabilities(struct hsmd_client *client,
case WIRE_HSMD_SETUP_CHANNEL_REPLY:
case WIRE_HSMD_CHECK_OUTPOINT_REPLY:
case WIRE_HSMD_LOCK_OUTPOINT_REPLY:
case WIRE_HSMD_FORGET_CHANNEL_REPLY:
case WIRE_HSMD_NODE_ANNOUNCEMENT_SIG_REPLY:
case WIRE_HSMD_SIGN_WITHDRAWAL_REPLY:
case WIRE_HSMD_SIGN_INVOICE_REPLY:
Expand Down Expand Up @@ -382,6 +384,21 @@ static u8 *handle_setup_channel(struct hsmd_client *c, const u8 *msg_in)
return towire_hsmd_setup_channel_reply(NULL);
}

/* ~This stub implementation is overriden by fully validating signers
* that need to manage per-channel state. */
static u8 *handle_forget_channel(struct hsmd_client *c, const u8 *msg_in)
{
struct node_id peer_id;
u64 dbid;

if (!fromwire_hsmd_forget_channel(msg_in, &peer_id, &dbid))
return hsmd_status_malformed_request(c, msg_in);

/* Stub implementation */

return towire_hsmd_forget_channel_reply(NULL);
}

/* ~This stub implementation is overridden by fully validating signers
* to ensure they are caught up when outpoints are freshly buried */
static u8 *handle_check_outpoint(struct hsmd_client *c, const u8 *msg_in)
Expand Down Expand Up @@ -1945,6 +1962,8 @@ u8 *hsmd_handle_client_message(const tal_t *ctx, struct hsmd_client *client,
return handle_check_outpoint(client, msg);
case WIRE_HSMD_LOCK_OUTPOINT:
return handle_lock_outpoint(client, msg);
case WIRE_HSMD_FORGET_CHANNEL:
return handle_forget_channel(client, msg);
case WIRE_HSMD_GET_OUTPUT_SCRIPTPUBKEY:
return handle_get_output_scriptpubkey(client, msg);
case WIRE_HSMD_CHECK_FUTURE_SECRET:
Expand Down Expand Up @@ -2024,6 +2043,7 @@ u8 *hsmd_handle_client_message(const tal_t *ctx, struct hsmd_client *client,
case WIRE_HSMD_SETUP_CHANNEL_REPLY:
case WIRE_HSMD_CHECK_OUTPOINT_REPLY:
case WIRE_HSMD_LOCK_OUTPOINT_REPLY:
case WIRE_HSMD_FORGET_CHANNEL_REPLY:
case WIRE_HSMD_NODE_ANNOUNCEMENT_SIG_REPLY:
case WIRE_HSMD_SIGN_WITHDRAWAL_REPLY:
case WIRE_HSMD_SIGN_INVOICE_REPLY:
Expand Down Expand Up @@ -2067,6 +2087,7 @@ u8 *hsmd_init(struct secret hsm_secret,
WIRE_HSMD_SIGN_HTLC_TX_MINGLE,
WIRE_HSMD_SIGN_SPLICE_TX,
WIRE_HSMD_CHECK_OUTPOINT,
WIRE_HSMD_FORGET_CHANNEL,
};

/*~ Don't swap this. */
Expand Down
12 changes: 12 additions & 0 deletions lightningd/channel.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,21 @@ static void destroy_channel(struct channel *channel)

void delete_channel(struct channel *channel STEALS)
{
const u8 *msg;

struct peer *peer = channel->peer;
if (channel->dbid != 0)
wallet_channel_close(channel->peer->ld->wallet, channel->dbid);

/* Tell the hsm to forget the channel, needs to be after it's
* been forgotten here */
if (hsm_capable(channel->peer->ld, WIRE_HSMD_FORGET_CHANNEL)) {
msg = towire_hsmd_forget_channel(NULL, &channel->peer->id, channel->dbid);
msg = hsm_sync_req(tmpctx, channel->peer->ld, take(msg));
if (!fromwire_hsmd_forget_channel_reply(msg))
fatal("HSM gave bad hsm_forget_channel_reply %s", tal_hex(msg, msg));
}
Comment on lines +97 to +105
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not know when this could happen, but what if the hsmd tells us that we cannot close the channel? Would it be better to leave the database untouched?

In other words, do you think it would be better to move this code above https://github.com/lightning-signer/c-lightning/pull/101/files#diff-109e3febdfc51acb2389960ebf7af4fab4deaff873eb35e09a57b3efea5225f3R96 ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thoughts:

  • The forget_channel calls happen 100 blocks after all future activity has become impossible because the channel has been closed, swept, etc. It's not about "closing" the channel but rather promising that it will never be referred to again, so it's OK to recover its resources (for VLS, recovering the memory is critical on small embedded devices like the SphinxSigner, aka Stakwork).
  • Both CLN and VLS appear to have a consistent strategy: forgetting the channel after the last bits are onchain and buried by 100 blocks.
  • The problem arises when there is confusion, with either CLN or VLS being out of date, etc.
  • VLS's handler for forget_channel does almost nothing, it merely sets a flag on the channel that tells us that we saw the node announce that it had forgotten it

So there are two "bad" cases, both caused by only part of the routine above completing:

  1. VLS forgets but CLN does not
  2. CLN forgets but VLS does not

In case #1 we have a terrible outcome: CLN asks VLS to sign something related to the channel (the defensive rebroadcast of the "last tx", for example) but the channel is unknown to VLS. This is what we saw on home4. VLS panics, and no progress is made because CLN panics as well, restarts, and then reissues the same operation again and again ...

Case #2 is not so bad: CLN goes on having forgotten about the channel, and VLS will warn "hey, I haven't seen the expected forget_channel from CLN for this channel" and keeps holding off. After 2016 blocks (roughly 2 weeks) VLS will give up and delete the channel. So some memory is not available for other channels in the meantime, but it heals in 2 weeks.

So we vastly prefer the second alternative, which corresponds to CLN telling VLS that it forgot the channel only after it has forgotten it for sure.

The operational takeaway is that if you see this warning persisting, it is good to check on the channel status on the node side. If something on the CLN side is a little broken, you can use the --developer lightning-cli dev-forget-channel id=0248691d3963b05a... to clean it up.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, thanks for writing it down.


tal_free(channel);

maybe_delete_peer(peer);
Expand Down
6 changes: 6 additions & 0 deletions wallet/test/run-wallet.c
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,9 @@ bool fromwire_hsmd_init_reply_v4(const tal_t *ctx UNNEEDED, const void *p UNNEED
/* Generated stub for fromwire_hsmd_new_channel_reply */
bool fromwire_hsmd_new_channel_reply(const void *p UNNEEDED)
{
	/* Announce the call on stderr, then abort the process. */
	fputs("fromwire_hsmd_new_channel_reply called!\n", stderr);
	abort();
}
/* Generated stub for fromwire_hsmd_forget_channel_reply */
bool fromwire_hsmd_forget_channel_reply(const void *p UNNEEDED)
{
	/* Announce the call on stderr, then abort the process. */
	fputs("fromwire_hsmd_forget_channel_reply called!\n", stderr);
	abort();
}
/* Generated stub for fromwire_hsmd_sign_commitment_tx_reply */
bool fromwire_hsmd_sign_commitment_tx_reply(const void *p UNNEEDED, struct bitcoin_signature *sig UNNEEDED)
{
	/* Announce the call on stderr, then abort the process. */
	fputs("fromwire_hsmd_sign_commitment_tx_reply called!\n", stderr);
	abort();
}
Expand Down Expand Up @@ -1008,6 +1011,9 @@ u8 *towire_hsmd_init(const tal_t *ctx UNNEEDED, const struct bip32_key_version *
/* Generated stub for towire_hsmd_new_channel */
u8 *towire_hsmd_new_channel(const tal_t *ctx UNNEEDED, const struct node_id *id UNNEEDED, u64 dbid UNNEEDED)
{
	/* Announce the call on stderr, then abort the process. */
	fputs("towire_hsmd_new_channel called!\n", stderr);
	abort();
}
/* Generated stub for towire_hsmd_forget_channel */
u8 *towire_hsmd_forget_channel(const tal_t *ctx UNNEEDED, const struct node_id *id UNNEEDED, u64 dbid UNNEEDED)
{
	/* Announce the call on stderr, then abort the process. */
	fputs("towire_hsmd_forget_channel called!\n", stderr);
	abort();
}
/* Generated stub for towire_hsmd_sign_commitment_tx */
u8 *towire_hsmd_sign_commitment_tx(const tal_t *ctx UNNEEDED, const struct node_id *peer_id UNNEEDED, u64 channel_dbid UNNEEDED, const struct bitcoin_tx *tx UNNEEDED, const struct pubkey *remote_funding_key UNNEEDED, u64 commit_num UNNEEDED)
{
	/* Announce the call on stderr, then abort the process. */
	fputs("towire_hsmd_sign_commitment_tx called!\n", stderr);
	abort();
}
Expand Down