Skip to content

Commit

Permalink
profiler: suppress errors if the profiler is stopped (#2886)
Browse files Browse the repository at this point in the history
The fix in #2865 was intended to suppress spurious metrics profile
errors when the profiler is stopped. It did so by relaxing the
one-second duration constraint of the metrics profiler. However,
the Windows system timer resolution is about 15 milliseconds (see
https://learn.microsoft.com/en-us/windows-hardware/drivers/kernel/high-resolution-timers#controlling-timer-accuracy)
This caused the metrics profile tests from #2865 to fail because the
metrics profiler will likely be stopped in less than 15 milliseconds,
meaning we'll see 0 duration between profile collection and log an
error.

This commit actually suppresses the error by checking whether the
profiler was stopped (meaning interruptibleSleep was interrupted). If
so, and if the metrics profiler returned an error, we instead return a
sentinel error indicating that profiling was stopped. If we see that
error, we just drop the profile and don't log an error. We won't upload
the profile anyway. This way, we should only report an error from the
metrics profiler if there is _actually_ a problem with the timer.
  • Loading branch information
nsrip-dd authored Sep 25, 2024
1 parent eef52d3 commit ac73f9b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 5 deletions.
5 changes: 4 additions & 1 deletion profiler/profile.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,11 @@ var profileTypes = map[ProfileType]profileType{
Filename: "metrics.json",
Collect: func(p *profiler) ([]byte, error) {
var buf bytes.Buffer
p.interruptibleSleep(p.cfg.period)
interrupted := p.interruptibleSleep(p.cfg.period)
err := p.met.report(now(), &buf)
if err != nil && interrupted {
err = errProfilerStopped
}
return buf.Bytes(), err
},
},
Expand Down
18 changes: 14 additions & 4 deletions profiler/profiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ var (
activeProfiler *profiler
containerID = internal.ContainerID() // replaced in tests
entityID = internal.EntityID() // replaced in tests

// errProfilerStopped is a sentinel for suppressng errors if we are
// about to stop the profiler
errProfilerStopped = errors.New("profiler stopped")
)

// Start starts the profiler. If the profiler is already running, it will be
Expand Down Expand Up @@ -343,9 +347,12 @@ func (p *profiler) collect(ticker <-chan time.Time) {
}
profs, err := p.runProfile(t)
if err != nil {
log.Error("Error getting %s profile: %v; skipping.", t, err)
tags := append(p.cfg.tags.Slice(), t.Tag())
p.cfg.statsd.Count("datadog.profiling.go.collect_error", 1, tags, 1)
if err != errProfilerStopped {
log.Error("Error getting %s profile: %v; skipping.", t, err)
tags := append(p.cfg.tags.Slice(), t.Tag())
p.cfg.statsd.Count("datadog.profiling.go.collect_error", 1, tags, 1)
}
return
}
mu.Lock()
defer mu.Unlock()
Expand Down Expand Up @@ -480,10 +487,13 @@ func (p *profiler) outputDir(bat batch) error {

// interruptibleSleep sleeps for the given duration or until interrupted by the
// p.exit channel being closed.
func (p *profiler) interruptibleSleep(d time.Duration) {
// Returns whether the sleep was interrupted
func (p *profiler) interruptibleSleep(d time.Duration) bool {
select {
case <-p.exit:
return true
case <-time.After(d):
return false
}
}

Expand Down

0 comments on commit ac73f9b

Please sign in to comment.