From c06cc2244f0f336b899df8760afaa4d6b5db99d7 Mon Sep 17 00:00:00 2001 From: Josh Deprez Date: Thu, 25 Jul 2024 17:55:44 +1000 Subject: [PATCH 1/6] Enforce grace period within executor --- internal/job/executor.go | 14 ++++++-------- internal/job/grace.go | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 8 deletions(-) create mode 100644 internal/job/grace.go diff --git a/internal/job/executor.go b/internal/job/executor.go index 5d08c6c3f8..63d3895c7b 100644 --- a/internal/job/executor.go +++ b/internal/job/executor.go @@ -121,13 +121,9 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) { defer stopper() defer func() { span.FinishWithError(err) }() - // kind of a weird series of events - wrap a context in a cancellation (at the top of the func), then pull the cancellation - // out again, but some of the stuff we need to do in the executor (namely the teardown) needs to be able to "survive" - // a cancellation so we need to be able to pass a cancellable context to the non-teardown stuff, and an uncancellable - // context to the teardown stuff - nonCancelCtx := context.WithoutCancel(ctx) - - // Listen for cancellation + // Listen for cancellation. Once ctx is cancelled, some tasks can run + // afterwards during the signal grace period. These use graceCtx. + graceCtx := withGracePeriod(ctx, e.SignalGracePeriod) go func() { <-e.cancelCh e.shell.Commentf("Received cancellation signal, interrupting") @@ -151,7 +147,9 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) { // Tear down the environment (and fire pre-exit hook) before we exit defer func() { - if err = e.tearDown(nonCancelCtx); err != nil { + // We strive to let the executor tear-down happen whether or not the job + // (and thus ctx) is cancelled, so it can run during the grace period. + if err := e.tearDown(graceCtx); err != nil { e.shell.Errorf("Error tearing down job executor: %v", err) // this gets passed back via the named return diff --git a/internal/job/grace.go b/internal/job/grace.go new file mode 100644 index 0000000000..02bfa92f25 --- /dev/null +++ b/internal/job/grace.go @@ -0,0 +1,22 @@ +package job + +import ( + "context" + "time" +) + +// withGracePeriod returns a context that is cancelled some time *after* the +// parent context is cancelled. In general this is not a good pattern, since it +// breaks the usual connection between context cancellations and requires an +// extra goroutine. However, we need to enforce the signal grace period from +// within the job executor for use-cases where the executor is _not_ forked from +// something else that will enforce the grace period (with SIGKILL). +func withGracePeriod(ctx context.Context, graceTimeout time.Duration) context.Context { + gctx, cancel := context.WithCancelCause(context.WithoutCancel(ctx)) + go func() { + <-ctx.Done() + time.Sleep(graceTimeout) + cancel(context.DeadlineExceeded) + }() + return gctx +} From e540e178d9c4fefb4ffcb112c2bb69ce5646165e Mon Sep 17 00:00:00 2001 From: Josh Deprez Date: Mon, 29 Jul 2024 14:02:31 +1000 Subject: [PATCH 2/6] Run artifact phase in grace period --- internal/job/executor.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/job/executor.go b/internal/job/executor.go index 63d3895c7b..bb5d0b47aa 100644 --- a/internal/job/executor.go +++ b/internal/job/executor.go @@ -235,8 +235,10 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) { span.RecordError(commandErr) } - // Only upload artifacts as part of the command phase - if err = e.artifactPhase(ctx); err != nil { + // Only upload artifacts as part of the command phase. + // The artifacts might be relevant for debugging job timeouts, so it can + // run during the grace period. + if err := e.artifactPhase(graceCtx); err != nil { e.shell.Errorf("%v", err) if commandErr != nil { From 39b5081d028c30d795d589cd45986feb47a9d5d0 Mon Sep 17 00:00:00 2001 From: Josh Deprez Date: Mon, 29 Jul 2024 13:29:58 +1000 Subject: [PATCH 3/6] Run post-command hooks in grace period --- internal/job/executor.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/job/executor.go b/internal/job/executor.go index bb5d0b47aa..8540877322 100644 --- a/internal/job/executor.go +++ b/internal/job/executor.go @@ -900,7 +900,9 @@ func (e *Executor) CommandPhase(ctx context.Context) (hookErr error, commandErr if preCommandErr != nil { return } - hookErr = e.runPostCommandHooks(ctx) + // Because post-command hooks are often used for post-job cleanup, they + // can run during the grace period. + hookErr = e.runPostCommandHooks(withGracePeriod(ctx, e.SignalGracePeriod)) }() // Run pre-command hooks From 2d0fcc33154953e65a11aefbe3805b251fd83bf0 Mon Sep 17 00:00:00 2001 From: Josh Deprez Date: Mon, 29 Jul 2024 13:43:26 +1000 Subject: [PATCH 4/6] Minor error tweaks in command phase --- internal/job/executor.go | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/internal/job/executor.go b/internal/job/executor.go index 8540877322..eb4c0b8917 100644 --- a/internal/job/executor.go +++ b/internal/job/executor.go @@ -831,58 +831,48 @@ func (e *Executor) tearDown(ctx context.Context) error { } // runPreCommandHooks runs the pre-command hooks and adds tracing spans. -func (e *Executor) runPreCommandHooks(ctx context.Context) error { +func (e *Executor) runPreCommandHooks(ctx context.Context) (err error) { spanName := e.implementationSpecificSpanName("pre-command", "pre-command hooks") span, ctx := tracetools.StartSpanFromContext(ctx, spanName, e.ExecutorConfig.TracingBackend) - var err error defer func() { span.FinishWithError(err) }() - if err = e.executeGlobalHook(ctx, "pre-command"); err != nil { - return err - } - if err = e.executeLocalHook(ctx, "pre-command"); err != nil { + if err := e.executeGlobalHook(ctx, "pre-command"); err != nil { return err } - if err = e.executePluginHook(ctx, "pre-command", e.pluginCheckouts); err != nil { + if err := e.executeLocalHook(ctx, "pre-command"); err != nil { return err } - return nil + return e.executePluginHook(ctx, "pre-command", e.pluginCheckouts) } // runCommand runs the command and adds tracing spans. func (e *Executor) runCommand(ctx context.Context) error { - var err error // There can only be one command hook, so we check them in order of plugin, local switch { case e.hasPluginHook("command"): - err = e.executePluginHook(ctx, "command", e.pluginCheckouts) + return e.executePluginHook(ctx, "command", e.pluginCheckouts) case e.hasLocalHook("command"): - err = e.executeLocalHook(ctx, "command") + return e.executeLocalHook(ctx, "command") case e.hasGlobalHook("command"): - err = e.executeGlobalHook(ctx, "command") + return e.executeGlobalHook(ctx, "command") default: - err = e.defaultCommandPhase(ctx) + return e.defaultCommandPhase(ctx) } - return err } // runPostCommandHooks runs the post-command hooks and adds tracing spans. -func (e *Executor) runPostCommandHooks(ctx context.Context) error { +func (e *Executor) runPostCommandHooks(ctx context.Context) (err error) { spanName := e.implementationSpecificSpanName("post-command", "post-command hooks") span, ctx := tracetools.StartSpanFromContext(ctx, spanName, e.ExecutorConfig.TracingBackend) - var err error defer func() { span.FinishWithError(err) }() - if err = e.executeGlobalHook(ctx, "post-command"); err != nil { + if err := e.executeGlobalHook(ctx, "post-command"); err != nil { return err } - if err = e.executeLocalHook(ctx, "post-command"); err != nil { + if err := e.executeLocalHook(ctx, "post-command"); err != nil { return err } - if err = e.executePluginHook(ctx, "post-command", e.pluginCheckouts); err != nil { - return err - } - return nil + return e.executePluginHook(ctx, "post-command", e.pluginCheckouts) } // CommandPhase determines how to run the build, and then runs it From 53a3a89448ef2f8258360bd5a5bf54d371621d56 Mon Sep 17 00:00:00 2001 From: Josh Deprez Date: Mon, 29 Jul 2024 14:28:59 +1000 Subject: [PATCH 5/6] Do same signalGracePeriod calc in bootstrap --- clicommand/agent_start.go | 23 +++-------------------- clicommand/bootstrap.go | 8 ++++++-- clicommand/cancel_signal.go | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 23 deletions(-) diff --git a/clicommand/agent_start.go b/clicommand/agent_start.go index 6434c80f13..41c85f4af4 100644 --- a/clicommand/agent_start.go +++ b/clicommand/agent_start.go @@ -843,28 +843,11 @@ var AgentStartCommand = cli.Command{ } } - // Treat a negative signal grace period as relative to the cancel grace period - if cfg.SignalGracePeriodSeconds < 0 { - if cfg.CancelGracePeriod < -cfg.SignalGracePeriodSeconds { - return fmt.Errorf( - "cancel-grace-period (%d) must be at least as big as signal-grace-period-seconds (%d)", - cfg.CancelGracePeriod, - cfg.SignalGracePeriodSeconds, - ) - } - cfg.SignalGracePeriodSeconds = cfg.CancelGracePeriod + cfg.SignalGracePeriodSeconds - } - - if cfg.CancelGracePeriod <= cfg.SignalGracePeriodSeconds { - return fmt.Errorf( - "cancel-grace-period (%d) must be greater than signal-grace-period-seconds (%d)", - cfg.CancelGracePeriod, - cfg.SignalGracePeriodSeconds, - ) + signalGracePeriod, err := signalGracePeriod(cfg.CancelGracePeriod, cfg.SignalGracePeriodSeconds) + if err != nil { + return err } - signalGracePeriod := time.Duration(cfg.SignalGracePeriodSeconds) * time.Second - mc := metrics.NewCollector(l, metrics.CollectorConfig{ Datadog: cfg.MetricsDatadog, DatadogHost: cfg.MetricsDatadogHost, diff --git a/clicommand/bootstrap.go b/clicommand/bootstrap.go index 5b4e795e65..65bc0719fd 100644 --- a/clicommand/bootstrap.go +++ b/clicommand/bootstrap.go @@ -8,7 +8,6 @@ import ( "runtime" "sync" "syscall" - "time" "github.com/buildkite/agent/v3/internal/job" "github.com/buildkite/agent/v3/process" @@ -91,6 +90,7 @@ type BootstrapConfig struct { Phases []string `cli:"phases" normalize:"list"` Profile string `cli:"profile"` CancelSignal string `cli:"cancel-signal"` + CancelGracePeriod int `cli:"cancel-grace-period"` SignalGracePeriodSeconds int `cli:"signal-grace-period-seconds"` RedactedVars []string `cli:"redacted-vars" normalize:"list"` TracingBackend string `cli:"tracing-backend"` @@ -371,6 +371,7 @@ var BootstrapCommand = cli.Command{ EnvVar: "BUILDKITE_CONTAINER_ID", }, cancelSignalFlag, + cancelGracePeriodFlag, signalGracePeriodSecondsFlag, // Global flags @@ -408,7 +409,10 @@ var BootstrapCommand = cli.Command{ return fmt.Errorf("failed to parse cancel-signal: %w", err) } - signalGracePeriod := time.Duration(cfg.SignalGracePeriodSeconds) * time.Second + signalGracePeriod, err := signalGracePeriod(cfg.CancelGracePeriod, cfg.SignalGracePeriodSeconds) + if err != nil { + return err + } // Configure the bootstraper bootstrap := job.New(job.ExecutorConfig{ diff --git a/clicommand/cancel_signal.go b/clicommand/cancel_signal.go index f7edac5362..c273ed98db 100644 --- a/clicommand/cancel_signal.go +++ b/clicommand/cancel_signal.go @@ -1,6 +1,11 @@ package clicommand -import "github.com/urfave/cli" +import ( + "fmt" + "time" + + "github.com/urfave/cli" +) const ( defaultCancelGracePeriod = 10 @@ -30,3 +35,33 @@ var ( Value: -1, } ) + +// signalGracePeriod computes the signal grace period based on the various +// possible flag configurations: +// - If signalGracePeriodSecs is negative, it is relative to +// cancelGracePeriodSecs. +// - If cancelGracePeriodSecs is less than signalGracePeriodSecs that is an +// error. +func signalGracePeriod(cancelGracePeriodSecs, signalGracePeriodSecs int) (time.Duration, error) { + // Treat a negative signal grace period as relative to the cancel grace period + if signalGracePeriodSecs < 0 { + if cancelGracePeriodSecs < -signalGracePeriodSecs { + return 0, fmt.Errorf( + "cancel-grace-period (%d) must be at least as big as signal-grace-period-seconds (%d)", + cancelGracePeriodSecs, + signalGracePeriodSecs, + ) + } + signalGracePeriodSecs = cancelGracePeriodSecs + signalGracePeriodSecs + } + + if cancelGracePeriodSecs <= signalGracePeriodSecs { + return 0, fmt.Errorf( + "cancel-grace-period (%d) must be greater than signal-grace-period-seconds (%d)", + cancelGracePeriodSecs, + signalGracePeriodSecs, + ) + } + + return time.Duration(signalGracePeriodSecs) * time.Second, nil +} From 0366b91599fc95d233266c6d11c77ebe6f8994e5 Mon Sep 17 00:00:00 2001 From: Josh Deprez Date: Mon, 29 Jul 2024 15:04:59 +1000 Subject: [PATCH 6/6] Add cancel callback to withGracePeriod --- internal/job/executor.go | 7 +++++-- internal/job/grace.go | 13 +++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/internal/job/executor.go b/internal/job/executor.go index eb4c0b8917..bddabad2ce 100644 --- a/internal/job/executor.go +++ b/internal/job/executor.go @@ -123,7 +123,8 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) { // Listen for cancellation. Once ctx is cancelled, some tasks can run // afterwards during the signal grace period. These use graceCtx. - graceCtx := withGracePeriod(ctx, e.SignalGracePeriod) + graceCtx, graceCancel := withGracePeriod(ctx, e.SignalGracePeriod) + defer graceCancel() go func() { <-e.cancelCh e.shell.Commentf("Received cancellation signal, interrupting") @@ -892,7 +893,9 @@ func (e *Executor) CommandPhase(ctx context.Context) (hookErr error, commandErr } // Because post-command hooks are often used for post-job cleanup, they // can run during the grace period. - hookErr = e.runPostCommandHooks(withGracePeriod(ctx, e.SignalGracePeriod)) + graceCtx, cancel := withGracePeriod(ctx, e.SignalGracePeriod) + defer cancel() + hookErr = e.runPostCommandHooks(graceCtx) }() // Run pre-command hooks diff --git a/internal/job/grace.go b/internal/job/grace.go index 02bfa92f25..141aef4fac 100644 --- a/internal/job/grace.go +++ b/internal/job/grace.go @@ -11,12 +11,17 @@ import ( // extra goroutine. However, we need to enforce the signal grace period from // within the job executor for use-cases where the executor is _not_ forked from // something else that will enforce the grace period (with SIGKILL). -func withGracePeriod(ctx context.Context, graceTimeout time.Duration) context.Context { +func withGracePeriod(ctx context.Context, graceTimeout time.Duration) (context.Context, context.CancelFunc) { gctx, cancel := context.WithCancelCause(context.WithoutCancel(ctx)) go func() { <-ctx.Done() - time.Sleep(graceTimeout) - cancel(context.DeadlineExceeded) + select { + case <-time.After(graceTimeout): + cancel(context.DeadlineExceeded) + + case <-gctx.Done(): + // This can happen if the caller called the cancel func. + } }() - return gctx + return gctx, func() { cancel(context.Canceled) } }