Commit bf3c32e

server: fault tolerance metrics
For each element of the locality tree, generate a gauge metric indicating the number of additional nodes that can fail if that locality were to fail completely. The raw values for these metrics are not meaningful on their own: they must be aggregated (by `min`) across all nodes within a failure domain to indicate the actual fault tolerance margin. Negative values indicate that a failure in this domain will cause at least one range to become unavailable. Zero indicates that this domain can fail without causing unavailability. Positive values indicate the worst-case number of additional replicas that need to become unavailable to cause a range to become unavailable.

Epic: none
Fixes: https://cockroachlabs.atlassian.net/browse/TREQ-1099

Release note (ops change): the new `fault_tolerance.nodes` metric provides a view into the fault tolerance state of the cluster. The metric is produced for each locality. By taking the `min` of the value within a locality, you can determine the number of additional nodes that can fail if that locality fails, before any unavailability. This is the "fault tolerance margin" for that locality. This metric is responsive to node liveness changes and changes in range allocation.
1 parent 7898c83 commit bf3c32e
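The `min` aggregation described in the release note can be illustrated with a minimal standalone sketch (plain Go; the locality label and per-node sample values below are hypothetical, not output from a real cluster):

```go
package main

import "fmt"

// sample is one scraped fault_tolerance.nodes gauge value, as reported
// by a single node for a single locality label.
type sample struct {
	locality string // e.g. "region=us-east1"
	value    int    // that node's local view of the margin for the label
}

func main() {
	// Hypothetical scrape: three nodes each report a value for the same
	// locality label. The raw per-node values are not meaningful alone.
	samples := []sample{
		{"region=us-east1", 2},
		{"region=us-east1", 1},
		{"region=us-east1", 3},
	}

	// The fault tolerance margin for a locality is the minimum across
	// all nodes reporting that label.
	margins := make(map[string]int)
	for _, s := range samples {
		if m, ok := margins[s.locality]; !ok || s.value < m {
			margins[s.locality] = s.value
		}
	}
	fmt.Println(margins) // map[region=us-east1:1]
}
```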

8 files changed: +707 −32 lines

docs/generated/metrics/metrics.yaml

Lines changed: 9 additions & 0 deletions
```diff
@@ -10201,6 +10201,15 @@ layers:
       essential: true
   - name: REPLICATION
     metrics:
+    - name: fault_tolerance.nodes
+      exported_name: fault_tolerance_nodes
+      description: "Number of nodes that can fail before we lose quorum on any range, \n if the labeled failure domain goes down. 0 indicates that we can tolerate a fault\n in the failure domain, but only just. Negative values indicate that the failure \n domain is critical to continued availability for at least one range."
+      y_axis_label: Nodes
+      type: GAUGE
+      unit: COUNT
+      aggregation: AVG
+      derivative: NONE
+      how_to_use: Take the minimum, faceted across all labels
     - name: leases.transfers.success
       exported_name: leases_transfers_success
       description: Number of successful lease transfers
```

pkg/BUILD.bazel

Lines changed: 3 additions & 0 deletions
```diff
@@ -351,6 +351,7 @@ ALL_TESTS = [
     "//pkg/server/diagnostics:diagnostics_disallowed_imports_test",
     "//pkg/server/diagnostics:diagnostics_test",
     "//pkg/server/dumpstore:dumpstore_test",
+    "//pkg/server/faulttolerance:faulttolerance_test",
     "//pkg/server/goroutinedumper:goroutinedumper_test",
     "//pkg/server/license:license_test",
     "//pkg/server/pgurl:pgurl_test",
@@ -1745,6 +1746,8 @@ GO_TARGETS = [
     "//pkg/server/diagnostics:diagnostics_test",
     "//pkg/server/dumpstore:dumpstore",
     "//pkg/server/dumpstore:dumpstore_test",
+    "//pkg/server/faulttolerance:faulttolerance",
+    "//pkg/server/faulttolerance:faulttolerance_test",
     "//pkg/server/goroutinedumper:goroutinedumper",
     "//pkg/server/goroutinedumper:goroutinedumper_test",
     "//pkg/server/license:license",
```

pkg/roachpb/metadata_replicas.go

Lines changed: 40 additions & 32 deletions
```diff
@@ -393,6 +393,15 @@ type RangeStatusReport struct {
 	UnderReplicatedNonVoters, OverReplicatedNonVoters bool
 }
 
+// isBoth takes two replica predicates and returns their conjunction.
+func isBoth(
+	pred1 func(rDesc ReplicaDescriptor) bool, pred2 func(rDesc ReplicaDescriptor) bool,
+) func(ReplicaDescriptor) bool {
+	return func(rDesc ReplicaDescriptor) bool {
+		return pred1(rDesc) && pred2(rDesc)
+	}
+}
+
 // ReplicationStatus returns availability and over/under-replication
 // determinations for the range.
 //
@@ -405,15 +414,31 @@ func (d ReplicaSet) ReplicationStatus(
 	liveFunc func(descriptor ReplicaDescriptor) bool, neededVoters int, neededNonVoters int,
 ) RangeStatusReport {
 	var res RangeStatusReport
-	// isBoth takes two replica predicates and returns their conjunction.
-	isBoth := func(
-		pred1 func(rDesc ReplicaDescriptor) bool,
-		pred2 func(rDesc ReplicaDescriptor) bool) func(ReplicaDescriptor) bool {
-		return func(rDesc ReplicaDescriptor) bool {
-			return pred1(rDesc) && pred2(rDesc)
-		}
+
+	rangeMargin := d.ReplicationMargin(liveFunc)
+
+	res.Available = rangeMargin >= 0
+
+	if neededVoters > 0 {
+		voterQuorum := neededVoters/2 + 1
+		res.UnderReplicated = voterQuorum+rangeMargin < neededVoters
+		res.OverReplicated = voterQuorum+rangeMargin > neededVoters
+	}
+
+	if neededNonVoters > -1 {
+		nonVoters := d.FilterToDescriptors(ReplicaDescriptor.IsNonVoter)
+		liveNonVoters := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsNonVoter, liveFunc))
+
+		res.UnderReplicatedNonVoters = len(liveNonVoters) < neededNonVoters
+		res.OverReplicatedNonVoters = len(nonVoters) > neededNonVoters
 	}
 
+	return res
+}
+
+// ReplicationMargin computes the amount by which live voters exceed quorum. Values >= 0 indicate an available range,
+// as well as the number of additional replicas we can lose before losing quorum.
+func (d ReplicaSet) ReplicationMargin(liveFunc func(descriptor ReplicaDescriptor) bool) int {
 	// This functions handles regular, or joint-consensus replica groups. In the
 	// joint-consensus case, we'll independently consider the health of the
 	// outgoing group ("old") and the incoming group ("new"). In the regular case,
@@ -422,35 +447,18 @@ func (d ReplicaSet) ReplicationStatus(
 	votersOldGroup := d.FilterToDescriptors(ReplicaDescriptor.IsVoterOldConfig)
 	liveVotersOldGroup := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsVoterOldConfig, liveFunc))
 
-	n := len(votersOldGroup)
-	// Empty groups succeed by default, to match the Raft implementation.
-	availableOutgoingGroup := (n == 0) || (len(liveVotersOldGroup) >= n/2+1)
-
 	votersNewGroup := d.FilterToDescriptors(ReplicaDescriptor.IsVoterNewConfig)
 	liveVotersNewGroup := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsVoterNewConfig, liveFunc))
 
-	n = len(votersNewGroup)
-	availableIncomingGroup := len(liveVotersNewGroup) >= n/2+1
-
-	res.Available = availableIncomingGroup && availableOutgoingGroup
-
-	// Determine over/under-replication of voting replicas. Note that learners
-	// don't matter.
-	underReplicatedOldGroup := len(liveVotersOldGroup) < neededVoters
-	underReplicatedNewGroup := len(liveVotersNewGroup) < neededVoters
-	overReplicatedOldGroup := len(votersOldGroup) > neededVoters
-	overReplicatedNewGroup := len(votersNewGroup) > neededVoters
-	res.UnderReplicated = underReplicatedOldGroup || underReplicatedNewGroup
-	res.OverReplicated = overReplicatedOldGroup || overReplicatedNewGroup
-	if neededNonVoters == -1 {
-		return res
+	n := len(votersNewGroup)
+	marginIncomingGroup := len(liveVotersNewGroup) - (n/2 + 1)
+	marginOutgoingGroup := marginIncomingGroup
+	// Empty groups succeed by default, to match the Raft implementation.
+	if len(votersOldGroup) > 0 {
+		n := len(votersOldGroup)
+		marginOutgoingGroup = len(liveVotersOldGroup) - (n/2 + 1)
 	}
-
-	nonVoters := d.FilterToDescriptors(ReplicaDescriptor.IsNonVoter)
-	liveNonVoters := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsNonVoter, liveFunc))
-	res.UnderReplicatedNonVoters = len(liveNonVoters) < neededNonVoters
-	res.OverReplicatedNonVoters = len(nonVoters) > neededNonVoters
-	return res
+	return min(marginIncomingGroup, marginOutgoingGroup)
 }
 
 // Empty returns true if `target` is an empty replication target.
```
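To make the margin arithmetic concrete, here is a minimal standalone sketch (plain Go, using the same `live - (n/2 + 1)` formula as `ReplicationMargin` above; the voter counts are hypothetical, and `margin` is an illustrative helper, not part of the package):

```go
package main

import "fmt"

// margin mirrors the formula in ReplicationMargin for a single voter
// group: live voters minus the quorum size (n/2 + 1).
func margin(voters, liveVoters int) int {
	return liveVoters - (voters/2 + 1)
}

func main() {
	// 5 voters, all live: quorum is 3, so 2 more failures are tolerable.
	fmt.Println(margin(5, 5)) // 2
	// 5 voters, one already down: the margin shrinks to 1.
	fmt.Println(margin(5, 4)) // 1
	// 3 voters, two down: quorum is lost, so the margin is negative.
	fmt.Println(margin(3, 1)) // -1

	// In a joint-consensus change, the range's margin is the min of the
	// outgoing and incoming groups, matching the code above.
	fmt.Println(min(margin(3, 3), margin(4, 3))) // min(1, 0) = 0
}
```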

pkg/server/BUILD.bazel

Lines changed: 1 addition & 0 deletions
```diff
@@ -179,6 +179,7 @@ go_library(
         "//pkg/server/decommissioning",
         "//pkg/server/diagnostics",
         "//pkg/server/diagnostics/diagnosticspb",
+        "//pkg/server/faulttolerance",
         "//pkg/server/goroutinedumper",
         "//pkg/server/license",
         "//pkg/server/pgurl",
```

pkg/server/faulttolerance/BUILD.bazel

Lines changed: 27 additions & 0 deletions
```
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
    name = "faulttolerance",
    srcs = ["locality.go"],
    importpath = "github.com/cockroachdb/cockroach/pkg/server/faulttolerance",
    visibility = ["//visibility:public"],
    deps = [
        "//pkg/gossip",
        "//pkg/kv/kvserver",
        "//pkg/kv/kvserver/liveness/livenesspb",
        "//pkg/roachpb",
        "//pkg/util/protoutil",
        "@com_github_cockroachdb_errors//:errors",
    ],
)

go_test(
    name = "faulttolerance_test",
    srcs = ["locality_test.go"],
    embed = [":faulttolerance"],
    deps = [
        "//pkg/kv/kvserver/liveness/livenesspb",
        "//pkg/roachpb",
        "@com_github_stretchr_testify//require",
    ],
)
```

pkg/server/faulttolerance/locality.go

Lines changed: 208 additions & 0 deletions
```go
// Copyright 2025 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

// Package faulttolerance provides visibility into the cluster's ability
// to tolerate failures across different failure domains.
package faulttolerance

import (
	"context"
	"iter"
	"maps"
	"slices"
	"strings"

	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/errors"
)

// LocalityMapsFromGossip constructs a map from each NodeID to its
// Locality, by deserializing the node descriptors from Gossip.
func LocalityMapsFromGossip(g *gossip.Gossip) (map[roachpb.NodeID]roachpb.Locality, error) {
	nodeLocality := make(map[roachpb.NodeID]roachpb.Locality)
	if err := g.IterateInfos(gossip.KeyNodeDescPrefix, func(key string, info gossip.Info) error {
		bs, err := info.Value.GetBytes()
		if err != nil {
			return errors.NewAssertionErrorWithWrappedErrf(err,
				"failed to extract bytes for key %q", key)
		}

		var d roachpb.NodeDescriptor
		if err := protoutil.Unmarshal(bs, &d); err != nil {
			return errors.NewAssertionErrorWithWrappedErrf(err,
				"failed to parse value for key %q", key)
		}

		// Don't use node descriptors with NodeID 0, because that's meant to
		// indicate that the node has been removed from the cluster.
		if d.NodeID != 0 {
			nodeLocality[d.NodeID] = d.Locality
		}
		return nil
	}); err != nil {
		return nil, err
	}
	return nodeLocality, nil
}

// ComputeFaultTolerance computes the fault tolerance margin for each
// failure domain. A failure domain is a set of nodes that map to a
// single locality in the nodeLocality map, or a merged set of nodes
// that share a locality prefix. The fault tolerance margin is the
// number of additional nodes (beyond the nodes in the failure domain)
// that can fail before any range experiences unavailability.
//
// Margins can be negative (indicating that the failure domain is
// critical; its failure will cause unavailability), 0 (the failure
// domain is not critical, but no additional node failures can be
// tolerated), or positive.
//
// The keys in the returned map are locality strings, as returned by
// `roachpb.Locality.String()`. They include "parent localities" as
// well as the actual leaf localities to which nodes are assigned.
//
// The fault tolerance computation occurs in the context of the
// livenessMap (to determine any existing node failures), and is
// evaluated for only the replicas that appear in the replicas Seq.
//
// The evaluation for each replica is independent: it is valid to merge
// the results of this computation for different sets of replicas by
// taking the `min` of the margin values.
func ComputeFaultTolerance(
	ctx context.Context,
	livenessMap livenesspb.NodeVitalityMap,
	nodeLocality map[roachpb.NodeID]roachpb.Locality,
	replicas iter.Seq[*kvserver.Replica],
) (map[string]int, error) {
	return computeFaultToleranceImpl(ctx, livenessMap, nodeLocality, replicas)
}

type replicaDescriber interface {
	Desc() *roachpb.RangeDescriptor
}

func computeFaultToleranceImpl[RD replicaDescriber](
	ctx context.Context,
	livenessMap livenesspb.NodeVitalityMap,
	nodeLocality map[roachpb.NodeID]roachpb.Locality,
	replicas iter.Seq[RD],
) (map[string]int, error) {
	failureDomainMap := makeFailureDomains(nodeLocality)

	domainMargins := make(map[string]int)
	for replica := range replicas {
		for domainKey, fd := range failureDomainMap {
			if err := ctx.Err(); err != nil {
				return nil, err
			}

			margin := replica.Desc().Replicas().ReplicationMargin(func(rd roachpb.ReplicaDescriptor) bool {
				if _, ok := fd.nodes[rd.NodeID]; ok {
					return false
				}
				return livenessMap[rd.NodeID].IsLive(livenesspb.SpanConfigConformance)
			})

			if oldMargin, ok := domainMargins[domainKey]; ok {
				domainMargins[domainKey] = min(oldMargin, margin)
			} else {
				domainMargins[domainKey] = margin
			}
		}
	}
	return domainMargins, nil
}

func makeFailureDomains(
	nodeLocality map[roachpb.NodeID]roachpb.Locality,
) map[string]*failureDomain {
	domainMap := make(map[string]*failureDomain)
	var unresolved []string
	// First, construct the leaf domains.
	for _, l := range nodeLocality {
		k := l.String()
		if _, ok := domainMap[k]; !ok {
			domainMap[k] = newFailureDomain(l, nodeLocality)
		}
		unresolved = append(unresolved, k)
	}

	// Sort the domains by descending length. In case the depth of the
	// locality tree varies, we want to handle the taller trees first.
	slices.SortStableFunc(unresolved, func(a, b string) int {
		aComma := strings.Count(a, ",")
		bComma := strings.Count(b, ",")
		return bComma - aComma
	})

	// Merge existing domains into parent domains.
	for len(unresolved) > 0 {
		fd := domainMap[unresolved[0]]
		unresolved = unresolved[1:]

		pdKey := fd.parentKey()
		if pdKey == "" {
			continue
		}
		if parentFailureDomain, ok := domainMap[pdKey]; !ok {
			// new unresolved parent domain
			pd := fd.parent()
			domainMap[pdKey] = pd
			unresolved = append(unresolved, pdKey)
		} else {
			// merge child into parent domain
			parentFailureDomain.merge(fd)
		}
	}
	return domainMap
}

type failureDomain struct {
	domain roachpb.Locality
	nodes  map[roachpb.NodeID]struct{}
}

func (fd *failureDomain) merge(rhs *failureDomain) {
	if match, _ := rhs.domain.Matches(fd.domain); !match {
		panic("cannot merge failure domain")
	}

	for n := range rhs.nodes {
		fd.nodes[n] = struct{}{}
	}
}

func (fd *failureDomain) parentKey() string {
	return roachpb.Locality{Tiers: fd.domain.Tiers[0 : len(fd.domain.Tiers)-1]}.String()
}

func newFailureDomain(
	domain roachpb.Locality, nodeLocality map[roachpb.NodeID]roachpb.Locality,
) *failureDomain {
	faultScenario := make(map[roachpb.NodeID]struct{})
	for node := range nodeLocality {
		if match, _ := nodeLocality[node].Matches(domain); match {
			faultScenario[node] = struct{}{}
		}
	}
	return &failureDomain{
		domain: domain,
		nodes:  faultScenario,
	}
}

func (fd *failureDomain) parent() *failureDomain {
	pd := roachpb.Locality{Tiers: fd.domain.Tiers[0 : len(fd.domain.Tiers)-1]}
	nodes := make(map[roachpb.NodeID]struct{}, len(fd.nodes))
	maps.Copy(nodes, fd.nodes)
	return &failureDomain{
		domain: pd,
		nodes:  nodes,
	}
}
```
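The failure-domain expansion performed by `makeFailureDomains` (leaf localities plus all of their "parent locality" prefixes, each with its merged node set) can be sketched standalone. This is a simplified illustration using plain strings in place of `roachpb.Locality`; the node IDs and localities are hypothetical:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical cluster: leaf locality per node, as
	// LocalityMapsFromGossip would report them.
	nodeLocality := map[int]string{
		1: "region=us-east1,zone=a",
		2: "region=us-east1,zone=b",
		3: "region=us-west1,zone=a",
	}

	// Expand each leaf locality into itself plus all of its prefixes
	// ("parent localities"), collecting the node set for each domain.
	domains := map[string]map[int]bool{}
	for node, loc := range nodeLocality {
		tiers := strings.Split(loc, ",")
		for i := 1; i <= len(tiers); i++ {
			key := strings.Join(tiers[:i], ",")
			if domains[key] == nil {
				domains[key] = map[int]bool{}
			}
			domains[key][node] = true
		}
	}

	// Each key is one failure domain; the gauge is labeled with this key,
	// and the domain's margin is evaluated with all of its nodes dead.
	for key, nodes := range domains {
		fmt.Println(key, "->", len(nodes), "node(s)")
	}
	// e.g. region=us-east1 -> 2 node(s)
	//      region=us-east1,zone=a -> 1 node(s)
}
```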
