Skip to content

Commit

Permalink
Set logging to debug when health check fails
Browse files Browse the repository at this point in the history
Signed-off-by: Yacov Manevich <[email protected]>
  • Loading branch information
yacovm committed Feb 7, 2025
1 parent db66511 commit 20b9114
Show file tree
Hide file tree
Showing 14 changed files with 599 additions and 43 deletions.
7 changes: 7 additions & 0 deletions RELEASES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Release Notes

## [v1.12.3] (pending)

### Configs
- Added logging auto-amplification upon a health check failure
- `--log-auto-amplification`
- `--log-auto-amplification-max-duration`

## [v1.12.2](https://github.com/ava-labs/avalanchego/releases/tag/v1.12.2)

This version is backwards compatible to [v1.12.0](https://github.com/ava-labs/avalanchego/releases/tag/v1.12.0). It is optional, but encouraged.
Expand Down
12 changes: 8 additions & 4 deletions api/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,19 +65,20 @@ type health struct {
liveness *worker
}

func New(log logging.Logger, registerer prometheus.Registerer) (Health, error) {
func New(log logging.Logger, registerer prometheus.Registerer, reportUnhealthy func(bool)) (Health, error) {
failingChecks := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "checks_failing",
Help: "number of currently failing health checks",
},
[]string{CheckLabel, TagLabel},
)

return &health{
log: log,
readiness: newWorker(log, "readiness", failingChecks),
health: newWorker(log, "health", failingChecks),
liveness: newWorker(log, "liveness", failingChecks),
readiness: newWorker(log, "readiness", failingChecks, emptyReport),
health: newWorker(log, "health", failingChecks, reportUnhealthy),
liveness: newWorker(log, "liveness", failingChecks, emptyReport),
}, registerer.Register(failingChecks)
}

Expand Down Expand Up @@ -137,3 +138,6 @@ func (h *health) Stop() {
h.health.Stop()
h.liveness.Stop()
}

func emptyReport(bool) {
}
12 changes: 6 additions & 6 deletions api/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func TestDuplicatedRegistrations(t *testing.T) {
return "", nil
})

h, err := New(logging.NoLog{}, prometheus.NewRegistry())
h, err := New(logging.NoLog{}, prometheus.NewRegistry(), emptyReport)
require.NoError(err)

require.NoError(h.RegisterReadinessCheck("check", check))
Expand All @@ -77,7 +77,7 @@ func TestDefaultFailing(t *testing.T) {
return "", nil
})

h, err := New(logging.NoLog{}, prometheus.NewRegistry())
h, err := New(logging.NoLog{}, prometheus.NewRegistry(), emptyReport)
require.NoError(err)

{
Expand Down Expand Up @@ -118,7 +118,7 @@ func TestPassingChecks(t *testing.T) {
return "", nil
})

h, err := New(logging.NoLog{}, prometheus.NewRegistry())
h, err := New(logging.NoLog{}, prometheus.NewRegistry(), emptyReport)
require.NoError(err)

require.NoError(h.RegisterReadinessCheck("check", check))
Expand Down Expand Up @@ -182,7 +182,7 @@ func TestPassingThenFailingChecks(t *testing.T) {
return "", nil
})

h, err := New(logging.NoLog{}, prometheus.NewRegistry())
h, err := New(logging.NoLog{}, prometheus.NewRegistry(), emptyReport)
require.NoError(err)

require.NoError(h.RegisterReadinessCheck("check", check))
Expand Down Expand Up @@ -229,7 +229,7 @@ func TestPassingThenFailingChecks(t *testing.T) {
func TestDeadlockRegression(t *testing.T) {
require := require.New(t)

h, err := New(logging.NoLog{}, prometheus.NewRegistry())
h, err := New(logging.NoLog{}, prometheus.NewRegistry(), emptyReport)
require.NoError(err)

var lock sync.Mutex
Expand Down Expand Up @@ -259,7 +259,7 @@ func TestTags(t *testing.T) {
return "", nil
})

h, err := New(logging.NoLog{}, prometheus.NewRegistry())
h, err := New(logging.NoLog{}, prometheus.NewRegistry(), emptyReport)
require.NoError(err)
require.NoError(h.RegisterHealthCheck("check1", check))
require.NoError(h.RegisterHealthCheck("check2", check, "tag1"))
Expand Down
4 changes: 2 additions & 2 deletions api/health/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func TestServiceResponses(t *testing.T) {
return "", nil
})

h, err := New(logging.NoLog{}, prometheus.NewRegistry())
h, err := New(logging.NoLog{}, prometheus.NewRegistry(), func(bool) {})
require.NoError(err)

s := &Service{
Expand Down Expand Up @@ -158,7 +158,7 @@ func TestServiceTagResponse(t *testing.T) {
t.Run(test.name, func(t *testing.T) {
require := require.New(t)

h, err := New(logging.NoLog{}, prometheus.NewRegistry())
h, err := New(logging.NoLog{}, prometheus.NewRegistry(), func(bool) {})
require.NoError(err)
require.NoError(test.register(h, "check1", check))
require.NoError(test.register(h, "check2", check, subnetID1.String()))
Expand Down
35 changes: 22 additions & 13 deletions api/health/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"maps"
"slices"
"sync"
"sync/atomic"
"time"

"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -39,10 +40,11 @@ type worker struct {
numFailingApplicationChecks int
tags map[string]set.Set[string] // tag -> set of check names

startOnce sync.Once
closeOnce sync.Once
wg sync.WaitGroup
closer chan struct{}
startOnce sync.Once
closeOnce sync.Once
wg sync.WaitGroup
closer chan struct{}
reportUnhealthy func(bool)
}

type taggedChecker struct {
Expand All @@ -55,6 +57,7 @@ func newWorker(
log logging.Logger,
name string,
failingChecks *prometheus.GaugeVec,
reportUnhealthy func(bool),
) *worker {
// Initialize the number of failing checks to 0 for all checks
for _, tag := range []string{AllTag, ApplicationTag} {
Expand All @@ -64,13 +67,14 @@ func newWorker(
}).Set(0)
}
return &worker{
log: log,
name: name,
failingChecks: failingChecks,
checks: make(map[string]*taggedChecker),
results: make(map[string]Result),
closer: make(chan struct{}),
tags: make(map[string]set.Set[string]),
reportUnhealthy: reportUnhealthy,
log: log,
name: name,
failingChecks: failingChecks,
checks: make(map[string]*taggedChecker),
results: make(map[string]Result),
closer: make(chan struct{}),
tags: make(map[string]set.Set[string]),
}
}

Expand Down Expand Up @@ -209,15 +213,19 @@ func (w *worker) runChecks(ctx context.Context) {
checks := maps.Clone(w.checks)
w.checksLock.RUnlock()

var hasSomeCheckFailed atomic.Bool

var wg sync.WaitGroup
wg.Add(len(checks))
for name, check := range checks {
go w.runCheck(ctx, &wg, name, check)
go w.runCheck(ctx, &wg, name, check, &hasSomeCheckFailed)
}
wg.Wait()

w.reportUnhealthy(hasSomeCheckFailed.Load())
}

func (w *worker) runCheck(ctx context.Context, wg *sync.WaitGroup, name string, check *taggedChecker) {
func (w *worker) runCheck(ctx context.Context, wg *sync.WaitGroup, name string, check *taggedChecker, checkStatus *atomic.Bool) {
defer wg.Done()

start := time.Now()
Expand All @@ -238,6 +246,7 @@ func (w *worker) runCheck(ctx context.Context, wg *sync.WaitGroup, name string,
defer w.resultsLock.Unlock()
prevResult := w.results[name]
if err != nil {
checkStatus.Store(true)
errString := err.Error()
result.Error = &errString

Expand Down
2 changes: 2 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ func getLoggingConfig(v *viper.Viper) (logging.Config, error) {
if v.IsSet(LogDisplayLevelKey) {
logDisplayLevel = v.GetString(LogDisplayLevelKey)
}
loggingConfig.LoggingAutoAmplification = v.GetBool(LogAutoAmplificationKey)
loggingConfig.LoggingAutoAmplificationMaxDuration = v.GetDuration(LoggingAutoAmplificationMaxDurationKey)
loggingConfig.DisplayLevel, err = logging.ToLevel(logDisplayLevel)
if err != nil {
return loggingConfig, err
Expand Down
2 changes: 2 additions & 0 deletions config/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ func addNodeFlags(fs *pflag.FlagSet) {
fs.Uint(LogRotaterMaxAgeKey, 0, "The maximum number of days to retain old log files based on the timestamp encoded in their filename. 0 means retain all old log files.")
fs.Bool(LogRotaterCompressEnabledKey, false, "Enables the compression of rotated log files through gzip.")
fs.Bool(LogDisableDisplayPluginLogsKey, false, "Disables displaying plugin logs in stdout.")
fs.Bool(LogAutoAmplificationKey, false, "Amplifies log level to DEBUG when health checks fail, reverts back once they succeed again or duration defined by --log-auto-amplification-max-duration elapses.")
fs.Duration(LoggingAutoAmplificationMaxDurationKey, 5*time.Minute, "Sets the max consecutive duration logging is amplified (see --log-auto-amplification).")

// Peer List Gossip
fs.Uint(NetworkPeerListNumValidatorIPsKey, constants.DefaultNetworkPeerListNumValidatorIPs, "Number of validator IPs to gossip to other nodes")
Expand Down
2 changes: 2 additions & 0 deletions config/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ const (
BenchlistFailThresholdKey = "benchlist-fail-threshold"
BenchlistDurationKey = "benchlist-duration"
BenchlistMinFailingDurationKey = "benchlist-min-failing-duration"
LogAutoAmplificationKey = "log-auto-amplification"
LoggingAutoAmplificationMaxDurationKey = "log-auto-amplification-max-duration"
LogsDirKey = "log-dir"
LogLevelKey = "log-level"
LogDisplayLevelKey = "log-display-level"
Expand Down
21 changes: 20 additions & 1 deletion node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,9 @@ type Node struct {

resourceManager resource.Manager

// Tracks bootstrapping of subnets
subnets *chains.Subnets

// Tracks the CPU/disk usage caused by processing
// messages of each peer.
resourceTracker tracker.ResourceTracker
Expand Down Expand Up @@ -1053,6 +1056,21 @@ func (n *Node) addDefaultVMAliases() error {
return nil
}

func (n *Node) reportUnhealthy(unhealthy bool) {
subnetsLeftToBootstrap := n.subnets.Bootstrapping()
if len(subnetsLeftToBootstrap) > 0 {
n.Log.Verbo("Still bootstrapping, will not change logging level",
zap.Int("subnets left to bootstrap", len(subnetsLeftToBootstrap)))
return
}

if unhealthy {
n.LogFactory.Amplify()
} else {
n.LogFactory.Revert()
}
}

// Create the chainManager and register the following VMs:
// AVM, Simple Payments DAG, Simple Payments Chain, and Platform VM
// Assumes n.DBManager, n.vdrs all initialized (non-nil)
Expand Down Expand Up @@ -1124,6 +1142,7 @@ func (n *Node) initChainManager(avaxAssetID ids.ID) error {
if err != nil {
return fmt.Errorf("failed to initialize subnets: %w", err)
}
n.subnets = subnets

n.chainManager, err = chains.New(
&chains.ManagerConfig{
Expand Down Expand Up @@ -1415,7 +1434,7 @@ func (n *Node) initHealthAPI() error {
return err
}

n.health, err = health.New(n.Log, healthReg)
n.health, err = health.New(n.Log, healthReg, n.reportUnhealthy)
if err != nil {
return err
}
Expand Down
Loading

0 comments on commit 20b9114

Please sign in to comment.