Skip to content

Commit

Permalink
Handle shutdown goroutine panic recovery differently
Browse files Browse the repository at this point in the history
  • Loading branch information
RebeccaMahany committed Jan 8, 2025
1 parent 507810a commit f665c36
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 16 deletions.
45 changes: 32 additions & 13 deletions ee/errgroup/errgroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,29 +119,48 @@ func (l *LoggedErrgroup) StartRepeatedGoroutine(ctx context.Context, goroutineNa
// AddShutdownGoroutine adds the given goroutine to the errgroup, ensuring that we log its start and exit.
// The goroutine will not execute until the errgroup has received a signal to exit.
func (l *LoggedErrgroup) AddShutdownGoroutine(ctx context.Context, goroutineName string, goroutine func() error) {
l.StartGoroutine(ctx, goroutineName, func() error {
l.errgroup.Go(func() error {
slogger := l.slogger.With("goroutine_name", goroutineName)

// Catch any panicking goroutines and log them. We do not want to return
// the error from this routine, as we do for StartGoroutine and StartRepeatedGoroutine --
// shutdown goroutines should not return an error besides the errgroup's initial error.
defer func() {
if r := recover(); r != nil {
slogger.Log(ctx, slog.LevelError,
"panic occurred in shutdown goroutine",
"err", r,
)
if err, ok := r.(error); ok {
slogger.Log(ctx, slog.LevelError,
"panic stack trace",
"stack_trace", fmt.Sprintf("%+v", errors.WithStack(err)),
)
}
}
}()

// Wait for errgroup to exit
<-l.doneCtx.Done()

slogger.Log(ctx, slog.LevelInfo,
"starting shutdown goroutine in errgroup",
)

goroutineStart := time.Now()
err := goroutine()
elapsedTime := time.Since(goroutineStart)

// Log anything amiss about the shutdown goroutine -- did it return an error? Did it take too long?
if err != nil {
slogger.Log(ctx, slog.LevelWarn,
"shutdown routine returned err",
"goroutine_run_time", elapsedTime.String(),
"goroutine_err", err,
)
} else if elapsedTime > maxShutdownGoroutineDuration {
slogger.Log(ctx, slog.LevelWarn,
"noticed slow shutdown routine",
"goroutine_run_time", elapsedTime.String(),
)
logLevel := slog.LevelInfo
if elapsedTime > maxShutdownGoroutineDuration || err != nil {
logLevel = slog.LevelWarn
}
slogger.Log(ctx, logLevel,
"exiting shutdown goroutine in errgroup",
"goroutine_name", goroutineName,
"goroutine_run_time", elapsedTime.String(),
"goroutine_err", err,
)

// We don't want to actually return the error here, to avoid causing an otherwise successful call
// to `Shutdown` => `Wait` to return an error. Shutdown routine errors don't matter for the success
Expand Down
64 changes: 61 additions & 3 deletions ee/errgroup/errgroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ func TestShutdown(t *testing.T) {
require.True(t, canceled, "errgroup did not exit")
}

func Test_HandlesPanic(t *testing.T) {
func TestStartGoroutine_HandlesPanic(t *testing.T) {
t.Parallel()

ctx, cancel := context.WithCancel(context.TODO())
Expand All @@ -116,8 +116,36 @@ func Test_HandlesPanic(t *testing.T) {
return nil
})

// We expect that the errgroup shuts down -- the test should not panic
eg.Shutdown()
// We expect that the errgroup shuts itself down -- the test should not panic
require.Error(t, eg.Wait(), "should have returned error from panicking goroutine")
canceled := false
select {
case <-eg.Exited():
canceled = true
default:
}

require.True(t, canceled, "errgroup did not exit")
}

func TestStartRepeatedGoroutine_HandlesPanic(t *testing.T) {
t.Parallel()

ctx, cancel := context.WithCancel(context.TODO())
defer cancel()

eg := NewLoggedErrgroup(ctx, multislogger.NewNopLogger())

eg.StartRepeatedGoroutine(ctx, "test_goroutine", 100*time.Millisecond, 50*time.Millisecond, func() error {
testArr := make([]int, 0)
fmt.Println(testArr[2]) // cause out-of-bounds panic
return nil
})

// Wait for long enough that the repeated goroutine executes at least once
time.Sleep(500 * time.Millisecond)

// We expect that the errgroup shuts itself down -- the test should not panic
require.Error(t, eg.Wait(), "should have returned error from panicking goroutine")
canceled := false
select {
Expand All @@ -128,3 +156,33 @@ func Test_HandlesPanic(t *testing.T) {

require.True(t, canceled, "errgroup did not exit")
}

func TestAddShutdownGoroutine_HandlesPanic(t *testing.T) {
t.Parallel()

ctx, cancel := context.WithCancel(context.TODO())
defer cancel()

eg := NewLoggedErrgroup(ctx, multislogger.NewNopLogger())

eg.AddShutdownGoroutine(ctx, "test_goroutine", func() error {
testArr := make([]int, 0)
fmt.Println(testArr[2]) // cause out-of-bounds panic
return nil
})

// Call shutdown so the shutdown goroutine runs and the errgroup returns.
eg.Shutdown()

// We expect that the errgroup shuts itself down -- the test should not panic.
// Since we called `Shutdown`, `Wait` should not return an error.
require.Nil(t, eg.Wait(), "should not returned error after call to Shutdown")
canceled := false
select {
case <-eg.Exited():
canceled = true
default:
}

require.True(t, canceled, "errgroup did not exit")
}

0 comments on commit f665c36

Please sign in to comment.