Skip to content

Commit 52e1b62

Browse files
committed
discovery: fix endless loop in gossip syncer on context cancellation
This commit fixes a critical bug where the channelGraphSyncer goroutine would enter an endless loop when context cancellation or peer disconnect errors occurred during the syncingChans or queryNewChannels states. The root cause was that state handler functions (handleSyncingChans and synchronizeChanIDs) did not return errors to the main goroutine loop. When these functions encountered fatal errors like context cancellation, they would log the error and return early without changing the syncer's state. This caused the main loop to immediately re-enter the same state handler, encounter the same error, and loop indefinitely while spamming error logs. The fix makes error handling explicit by having state handlers return errors. The main channelGraphSyncer loop now checks these errors and exits cleanly when fatal errors occur. We return any error (not just context cancellation) because fatal errors can manifest in multiple forms: context.Canceled, ErrGossipSyncerExiting from the rate limiter, lnpeer.ErrPeerExiting from Brontide, or network errors like connection closed. This approach matches the error handling pattern already used in other goroutines like replyHandler.
1 parent f938e40 commit 52e1b62

File tree

2 files changed

+43
-12
lines changed

2 files changed

+43
-12
lines changed

discovery/syncer.go

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -492,15 +492,19 @@ func (g *GossipSyncer) Stop() {
492492

493493
// handleSyncingChans handles the state syncingChans for the GossipSyncer. When
494494
// in this state, we will send a QueryChannelRange msg to our peer and advance
495-
// the syncer's state to waitingQueryRangeReply.
496-
func (g *GossipSyncer) handleSyncingChans(ctx context.Context) {
495+
// the syncer's state to waitingQueryRangeReply. Returns an error if a fatal
496+
// error occurs that should cause the goroutine to exit.
497+
func (g *GossipSyncer) handleSyncingChans(ctx context.Context) error {
497498
// Prepare the query msg.
498499
queryRangeMsg, err := g.genChanRangeQuery(
499500
ctx, g.genHistoricalChanRangeQuery,
500501
)
501502
if err != nil {
502503
log.Errorf("Unable to gen chan range query: %v", err)
503-
return
504+
505+
// Any error here is likely fatal (context cancelled, db error,
506+
// etc.), so return it to exit the goroutine cleanly.
507+
return err
504508
}
505509

506510
// Acquire a lock so the following state transition is atomic.
@@ -517,12 +521,18 @@ func (g *GossipSyncer) handleSyncingChans(ctx context.Context) {
517521
err = g.sendToPeer(ctx, queryRangeMsg)
518522
if err != nil {
519523
log.Errorf("Unable to send chan range query: %v", err)
520-
return
524+
525+
// Any send error (peer exiting, connection closed, rate
526+
// limiter signaling exit, etc.) is fatal, so return it to
527+
// exit the goroutine cleanly.
528+
return err
521529
}
522530

523531
// With the message sent successfully, we'll transition into the next
524532
// state where we wait for their reply.
525533
g.setSyncState(waitingQueryRangeReply)
534+
535+
return nil
526536
}
527537

528538
// channelGraphSyncer is the main goroutine responsible for ensuring that we
@@ -545,7 +555,13 @@ func (g *GossipSyncer) channelGraphSyncer(ctx context.Context) {
545555
// understand, as we'll as responding to any other queries by
546556
// them.
547557
case syncingChans:
548-
g.handleSyncingChans(ctx)
558+
err := g.handleSyncingChans(ctx)
559+
if err != nil {
560+
log.Debugf("GossipSyncer(%x): exiting due to "+
561+
"error in syncingChans: %v",
562+
g.cfg.peerPub[:], err)
563+
return
564+
}
549565

550566
// In this state, we've sent out our initial channel range
551567
// query and are waiting for the final response from the remote
@@ -593,7 +609,13 @@ func (g *GossipSyncer) channelGraphSyncer(ctx context.Context) {
593609
// First, we'll attempt to continue our channel
594610
// synchronization by continuing to send off another
595611
// query chunk.
596-
done := g.synchronizeChanIDs(ctx)
612+
done, err := g.synchronizeChanIDs(ctx)
613+
if err != nil {
614+
log.Debugf("GossipSyncer(%x): exiting due to "+
615+
"error in queryNewChannels: %v",
616+
g.cfg.peerPub[:], err)
617+
return
618+
}
597619

598620
// If this wasn't our last query, then we'll need to
599621
// transition to our waiting state.
@@ -819,16 +841,18 @@ func (g *GossipSyncer) sendGossipTimestampRange(ctx context.Context,
819841
// range. This method will be called continually until the entire range has
820842
// been queried for with a response received. We'll chunk our requests as
821843
// required to ensure they fit into a single message. We may re-renter this
822-
// state in the case that chunking is required.
823-
func (g *GossipSyncer) synchronizeChanIDs(ctx context.Context) bool {
844+
// state in the case that chunking is required. Returns true if synchronization
845+
// is complete, and an error if a fatal error occurs that should cause the
846+
// goroutine to exit.
847+
func (g *GossipSyncer) synchronizeChanIDs(ctx context.Context) (bool, error) {
824848
// If we're in this state yet there are no more new channels to query
825849
// for, then we'll transition to our final synced state and return true
826850
// to signal that we're fully synchronized.
827851
if len(g.newChansToQuery) == 0 {
828852
log.Infof("GossipSyncer(%x): no more chans to query",
829853
g.cfg.peerPub[:])
830854

831-
return true
855+
return true, nil
832856
}
833857

834858
// Otherwise, we'll issue our next chunked query to receive replies
@@ -864,9 +888,14 @@ func (g *GossipSyncer) synchronizeChanIDs(ctx context.Context) bool {
864888
})
865889
if err != nil {
866890
log.Errorf("Unable to sync chan IDs: %v", err)
891+
892+
// Any send error (peer exiting, connection closed, rate
893+
// limiter signaling exit, etc.) is fatal, so return it to
894+
// exit the goroutine cleanly.
895+
return false, err
867896
}
868897

869-
return false
898+
return false, nil
870899
}
871900

872901
// isLegacyReplyChannelRange determines where a ReplyChannelRange message is

discovery/syncer_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,7 +1518,8 @@ func TestGossipSyncerSynchronizeChanIDs(t *testing.T) {
15181518

15191519
for i := 0; i < chunkSize*2; i += 2 {
15201520
// With our set up complete, we'll request a sync of chan ID's.
1521-
done := syncer.synchronizeChanIDs(t.Context())
1521+
done, err := syncer.synchronizeChanIDs(t.Context())
1522+
require.NoError(t, err)
15221523

15231524
// At this point, we shouldn't yet be done as only 2 items
15241525
// should have been queried for.
@@ -1565,7 +1566,8 @@ func TestGossipSyncerSynchronizeChanIDs(t *testing.T) {
15651566
}
15661567

15671568
// If we issue another query, the syncer should tell us that it's done.
1568-
done := syncer.synchronizeChanIDs(t.Context())
1569+
done, err := syncer.synchronizeChanIDs(t.Context())
1570+
require.NoError(t, err)
15691571
if done {
15701572
t.Fatalf("syncer should be finished!")
15711573
}

0 commit comments

Comments
 (0)