Skip to content

Commit

Permalink
Merge pull request #2899 from buildkite/artifacts-and-timeouts
Browse files Browse the repository at this point in the history
Let artifact phase and post-command run in grace period
  • Loading branch information
DrJosh9000 authored Jul 29, 2024
2 parents dedc2eb + 0366b91 commit adb7e8a
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 56 deletions.
23 changes: 3 additions & 20 deletions clicommand/agent_start.go
Original file line number Diff line number Diff line change
Expand Up @@ -843,28 +843,11 @@ var AgentStartCommand = cli.Command{
}
}

// Treat a negative signal grace period as relative to the cancel grace period
if cfg.SignalGracePeriodSeconds < 0 {
if cfg.CancelGracePeriod < -cfg.SignalGracePeriodSeconds {
return fmt.Errorf(
"cancel-grace-period (%d) must be at least as big as signal-grace-period-seconds (%d)",
cfg.CancelGracePeriod,
cfg.SignalGracePeriodSeconds,
)
}
cfg.SignalGracePeriodSeconds = cfg.CancelGracePeriod + cfg.SignalGracePeriodSeconds
}

if cfg.CancelGracePeriod <= cfg.SignalGracePeriodSeconds {
return fmt.Errorf(
"cancel-grace-period (%d) must be greater than signal-grace-period-seconds (%d)",
cfg.CancelGracePeriod,
cfg.SignalGracePeriodSeconds,
)
signalGracePeriod, err := signalGracePeriod(cfg.CancelGracePeriod, cfg.SignalGracePeriodSeconds)
if err != nil {
return err
}

signalGracePeriod := time.Duration(cfg.SignalGracePeriodSeconds) * time.Second

mc := metrics.NewCollector(l, metrics.CollectorConfig{
Datadog: cfg.MetricsDatadog,
DatadogHost: cfg.MetricsDatadogHost,
Expand Down
8 changes: 6 additions & 2 deletions clicommand/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"runtime"
"sync"
"syscall"
"time"

"github.com/buildkite/agent/v3/internal/job"
"github.com/buildkite/agent/v3/process"
Expand Down Expand Up @@ -91,6 +90,7 @@ type BootstrapConfig struct {
Phases []string `cli:"phases" normalize:"list"`
Profile string `cli:"profile"`
CancelSignal string `cli:"cancel-signal"`
CancelGracePeriod int `cli:"cancel-grace-period"`
SignalGracePeriodSeconds int `cli:"signal-grace-period-seconds"`
RedactedVars []string `cli:"redacted-vars" normalize:"list"`
TracingBackend string `cli:"tracing-backend"`
Expand Down Expand Up @@ -371,6 +371,7 @@ var BootstrapCommand = cli.Command{
EnvVar: "BUILDKITE_CONTAINER_ID",
},
cancelSignalFlag,
cancelGracePeriodFlag,
signalGracePeriodSecondsFlag,

// Global flags
Expand Down Expand Up @@ -408,7 +409,10 @@ var BootstrapCommand = cli.Command{
return fmt.Errorf("failed to parse cancel-signal: %w", err)
}

signalGracePeriod := time.Duration(cfg.SignalGracePeriodSeconds) * time.Second
signalGracePeriod, err := signalGracePeriod(cfg.CancelGracePeriod, cfg.SignalGracePeriodSeconds)
if err != nil {
return err
}

// Configure the bootstraper
bootstrap := job.New(job.ExecutorConfig{
Expand Down
37 changes: 36 additions & 1 deletion clicommand/cancel_signal.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
package clicommand

import "github.com/urfave/cli"
import (
"fmt"
"time"

"github.com/urfave/cli"
)

const (
defaultCancelGracePeriod = 10
Expand Down Expand Up @@ -30,3 +35,33 @@ var (
Value: -1,
}
)

// signalGracePeriod computes the signal grace period based on the various
// possible flag configurations:
// - If signalGracePeriodSecs is negative, it is relative to
// cancelGracePeriodSecs.
// - If cancelGracePeriodSecs is less than signalGracePeriodSecs that is an
// error.
func signalGracePeriod(cancelGracePeriodSecs, signalGracePeriodSecs int) (time.Duration, error) {
// Treat a negative signal grace period as relative to the cancel grace period
if signalGracePeriodSecs < 0 {
if cancelGracePeriodSecs < -signalGracePeriodSecs {
return 0, fmt.Errorf(
"cancel-grace-period (%d) must be at least as big as signal-grace-period-seconds (%d)",
cancelGracePeriodSecs,
signalGracePeriodSecs,
)
}
signalGracePeriodSecs = cancelGracePeriodSecs + signalGracePeriodSecs
}

if cancelGracePeriodSecs <= signalGracePeriodSecs {
return 0, fmt.Errorf(
"cancel-grace-period (%d) must be greater than signal-grace-period-seconds (%d)",
cancelGracePeriodSecs,
signalGracePeriodSecs,
)
}

return time.Duration(signalGracePeriodSecs) * time.Second, nil
}
61 changes: 28 additions & 33 deletions internal/job/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,10 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {
defer stopper()
defer func() { span.FinishWithError(err) }()

// kind of a weird series of events - wrap a context in a cancellation (at the top of the func), then pull the cancellation
// out again, but some of the stuff we need to do in the executor (namely the teardown) needs to be able to "survive"
// a cancellation so we need to be able to pass a cancellable context to the non-teardown stuff, and an uncancellable
// context to the teardown stuff
nonCancelCtx := context.WithoutCancel(ctx)

// Listen for cancellation
// Listen for cancellation. Once ctx is cancelled, some tasks can run
// afterwards during the signal grace period. These use graceCtx.
graceCtx, graceCancel := withGracePeriod(ctx, e.SignalGracePeriod)
defer graceCancel()
go func() {
<-e.cancelCh
e.shell.Commentf("Received cancellation signal, interrupting")
Expand All @@ -151,7 +148,9 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {

// Tear down the environment (and fire pre-exit hook) before we exit
defer func() {
if err = e.tearDown(nonCancelCtx); err != nil {
// We strive to let the executor tear-down happen whether or not the job
// (and thus ctx) is cancelled, so it can run during the grace period.
if err := e.tearDown(graceCtx); err != nil {
e.shell.Errorf("Error tearing down job executor: %v", err)

// this gets passed back via the named return
Expand Down Expand Up @@ -237,8 +236,10 @@ func (e *Executor) Run(ctx context.Context) (exitCode int) {
span.RecordError(commandErr)
}

// Only upload artifacts as part of the command phase
if err = e.artifactPhase(ctx); err != nil {
// Only upload artifacts as part of the command phase.
// The artifacts might be relevant for debugging job timeouts, so it can
// run during the grace period.
if err := e.artifactPhase(graceCtx); err != nil {
e.shell.Errorf("%v", err)

if commandErr != nil {
Expand Down Expand Up @@ -831,58 +832,48 @@ func (e *Executor) tearDown(ctx context.Context) error {
}

// runPreCommandHooks runs the pre-command hooks and adds tracing spans.
func (e *Executor) runPreCommandHooks(ctx context.Context) error {
func (e *Executor) runPreCommandHooks(ctx context.Context) (err error) {
spanName := e.implementationSpecificSpanName("pre-command", "pre-command hooks")
span, ctx := tracetools.StartSpanFromContext(ctx, spanName, e.ExecutorConfig.TracingBackend)
var err error
defer func() { span.FinishWithError(err) }()

if err = e.executeGlobalHook(ctx, "pre-command"); err != nil {
return err
}
if err = e.executeLocalHook(ctx, "pre-command"); err != nil {
if err := e.executeGlobalHook(ctx, "pre-command"); err != nil {
return err
}
if err = e.executePluginHook(ctx, "pre-command", e.pluginCheckouts); err != nil {
if err := e.executeLocalHook(ctx, "pre-command"); err != nil {
return err
}
return nil
return e.executePluginHook(ctx, "pre-command", e.pluginCheckouts)
}

// runCommand runs the command and adds tracing spans.
func (e *Executor) runCommand(ctx context.Context) error {
var err error
// There can only be one command hook, so we check them in order of plugin, local
switch {
case e.hasPluginHook("command"):
err = e.executePluginHook(ctx, "command", e.pluginCheckouts)
return e.executePluginHook(ctx, "command", e.pluginCheckouts)
case e.hasLocalHook("command"):
err = e.executeLocalHook(ctx, "command")
return e.executeLocalHook(ctx, "command")
case e.hasGlobalHook("command"):
err = e.executeGlobalHook(ctx, "command")
return e.executeGlobalHook(ctx, "command")
default:
err = e.defaultCommandPhase(ctx)
return e.defaultCommandPhase(ctx)
}
return err
}

// runPostCommandHooks runs the post-command hooks and adds tracing spans.
func (e *Executor) runPostCommandHooks(ctx context.Context) error {
func (e *Executor) runPostCommandHooks(ctx context.Context) (err error) {
spanName := e.implementationSpecificSpanName("post-command", "post-command hooks")
span, ctx := tracetools.StartSpanFromContext(ctx, spanName, e.ExecutorConfig.TracingBackend)
var err error
defer func() { span.FinishWithError(err) }()

if err = e.executeGlobalHook(ctx, "post-command"); err != nil {
if err := e.executeGlobalHook(ctx, "post-command"); err != nil {
return err
}
if err = e.executeLocalHook(ctx, "post-command"); err != nil {
if err := e.executeLocalHook(ctx, "post-command"); err != nil {
return err
}
if err = e.executePluginHook(ctx, "post-command", e.pluginCheckouts); err != nil {
return err
}
return nil
return e.executePluginHook(ctx, "post-command", e.pluginCheckouts)
}

// CommandPhase determines how to run the build, and then runs it
Expand All @@ -900,7 +891,11 @@ func (e *Executor) CommandPhase(ctx context.Context) (hookErr error, commandErr
if preCommandErr != nil {
return
}
hookErr = e.runPostCommandHooks(ctx)
// Because post-command hooks are often used for post-job cleanup, they
// can run during the grace period.
graceCtx, cancel := withGracePeriod(ctx, e.SignalGracePeriod)
defer cancel()
hookErr = e.runPostCommandHooks(graceCtx)
}()

// Run pre-command hooks
Expand Down
27 changes: 27 additions & 0 deletions internal/job/grace.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package job

import (
"context"
"time"
)

// withGracePeriod returns a context that is cancelled some time *after* the
// parent context is cancelled. In general this is not a good pattern, since it
// breaks the usual connection between context cancellations and requires an
// extra goroutine. However, we need to enforce the signal grace period from
// within the job executor for use-cases where the executor is _not_ forked from
// something else that will enforce the grace period (with SIGKILL).
func withGracePeriod(ctx context.Context, graceTimeout time.Duration) (context.Context, context.CancelFunc) {
gctx, cancel := context.WithCancelCause(context.WithoutCancel(ctx))
go func() {
<-ctx.Done()
select {
case <-time.After(graceTimeout):
cancel(context.DeadlineExceeded)

case <-gctx.Done():
// This can happen if the caller called the cancel func.
}
}()
return gctx, func() { cancel(context.Canceled) }
}

0 comments on commit adb7e8a

Please sign in to comment.