
Commit c60341e

server: fault tolerance metrics
For each element of the locality tree, generate a gauge metric indicating the number of additional nodes that can fail if that locality were to fail completely. The raw per-node values for these metrics are not meaningful on their own; they must be aggregated across all nodes within a failure domain to yield the actual fault tolerance margin. Negative values indicate that a failure in this domain will cause at least one range to become unavailable. 0 indicates that this domain can fail without causing unavailability, but only just. Positive values indicate the worst-case number of additional replica failures needed to make a range unavailable.

Epic: none
Fixes: https://cockroachlabs.atlassian.net/browse/TREQ-1099

Release note (ops change): the new `fault_tolerance.nodes` metric provides a view into the fault tolerance state of the cluster. The metric is produced for each locality. By taking the `min` of the value within a locality, you can determine the number of additional nodes that can fail if that locality fails, before any unavailability. This is the "fault tolerance margin" for that locality. The metric is responsive to node liveness changes and changes in range allocation.
1 parent 7898c83 commit c60341e
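As a sketch of how the release note's guidance might be applied by a consumer of the exported `fault_tolerance_nodes` gauge, the hypothetical Go helper below (not part of this commit; the sample layout and names are illustrative assumptions) reduces per-node samples to one fault tolerance margin per locality label by taking the minimum:

package main

import "fmt"

// faultToleranceMargins reduces per-node gauge samples to one margin per
// locality by taking the minimum value reported for that locality.
// samples maps a locality label to the gauge values exported by each node.
func faultToleranceMargins(samples map[string][]int) map[string]int {
	margins := make(map[string]int, len(samples))
	for locality, values := range samples {
		for i, v := range values {
			if i == 0 || v < margins[locality] {
				margins[locality] = v
			}
		}
	}
	return margins
}

func main() {
	// Illustrative values only: region=us-east1 is one additional failure
	// away from unavailability (margin 0); region=us-west1 has headroom.
	samples := map[string][]int{
		"region=us-east1": {1, 0, 2},
		"region=us-west1": {1, 1, 3},
	}
	fmt.Println(faultToleranceMargins(samples))
	// map[region=us-east1:0 region=us-west1:1]
}

Because each node reports only its own worst-case view, the per-node values matter only through this minimum, which is why the commit message describes the raw values as not meaningful on their own.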

File tree

8 files changed: +739, −26 lines

docs/generated/metrics/metrics.yaml

Lines changed: 9 additions & 0 deletions
@@ -10201,6 +10201,15 @@ layers:
       essential: true
   - name: REPLICATION
     metrics:
+    - name: fault_tolerance.nodes
+      exported_name: fault_tolerance_nodes
+      description: "Number of nodes that can fail before we lose quorum on any range, \n if the labeled failure domain goes down. 0 indicates that we can tolerate a fault\n in the failure domain, but only just. Negative values indicate that the failure \n domain is critical to continued availability for at least one range."
+      y_axis_label: Nodes
+      type: GAUGE
+      unit: COUNT
+      aggregation: AVG
+      derivative: NONE
+      how_to_use: Take the minimum, faceted across all labels
     - name: leases.transfers.success
       exported_name: leases_transfers_success
       description: Number of successful lease transfers

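To make the `description` and `how_to_use` fields above concrete, here is a small hypothetical helper (illustrative only, not shipped with this change) that spells out how a sampled value of the gauge reads for the labeled failure domain:

package main

import "fmt"

// interpretMargin explains a sampled fault_tolerance.nodes value for one
// failure domain, following the metric description above.
func interpretMargin(margin int) string {
	switch {
	case margin < 0:
		return "critical: losing this failure domain would make at least one range unavailable"
	case margin == 0:
		return "tolerable: this failure domain can go down, but with no headroom for further failures"
	default:
		return fmt.Sprintf("tolerable: %d additional node(s) beyond this failure domain can also fail", margin)
	}
}

func main() {
	for _, m := range []int{-1, 0, 2} {
		fmt.Printf("%d => %s\n", m, interpretMargin(m))
	}
}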
pkg/BUILD.bazel

Lines changed: 3 additions & 0 deletions
@@ -351,6 +351,7 @@ ALL_TESTS = [
     "//pkg/server/diagnostics:diagnostics_disallowed_imports_test",
     "//pkg/server/diagnostics:diagnostics_test",
     "//pkg/server/dumpstore:dumpstore_test",
+    "//pkg/server/faulttolerance:faulttolerance_test",
     "//pkg/server/goroutinedumper:goroutinedumper_test",
     "//pkg/server/license:license_test",
     "//pkg/server/pgurl:pgurl_test",
@@ -1745,6 +1746,8 @@ GO_TARGETS = [
     "//pkg/server/diagnostics:diagnostics_test",
     "//pkg/server/dumpstore:dumpstore",
     "//pkg/server/dumpstore:dumpstore_test",
+    "//pkg/server/faulttolerance:faulttolerance",
+    "//pkg/server/faulttolerance:faulttolerance_test",
     "//pkg/server/goroutinedumper:goroutinedumper",
     "//pkg/server/goroutinedumper:goroutinedumper_test",
     "//pkg/server/license:license",

pkg/roachpb/metadata_replicas.go

Lines changed: 47 additions & 26 deletions
@@ -393,66 +393,87 @@ type RangeStatusReport struct {
     UnderReplicatedNonVoters, OverReplicatedNonVoters bool
 }

+// countLive will first filter the ReplicaSet using filterPred, then
+// count the live descriptors using livePred. It returns the size of
+// both sets.
+func (d ReplicaSet) countLive(
+    filterPred func(rDesc ReplicaDescriptor) bool, livePred func(rDesc ReplicaDescriptor) bool,
+) (total, live int) {
+    filtered := d.FilterToDescriptors(filterPred)
+    wrappedFiltered := ReplicaSet{wrapped: filtered}
+    liveDesc := wrappedFiltered.FilterToDescriptors(livePred)
+
+    return len(filtered), len(liveDesc)
+}
+
 // ReplicationStatus returns availability and over/under-replication
 // determinations for the range.
 //
 // neededVoters is the replica's desired replication for purposes of determining
-// over/under-replication of voters. If the caller is only interested in
+// over/under-replication of voters. If the caller is only interested in the
 // availability of voting replicas, 0 can be passed in. neededNonVoters is the
 // counterpart for non-voting replicas but with -1 as the sentinel value (unlike
 // voters, it's possible to expect 0 non-voters).
 func (d ReplicaSet) ReplicationStatus(
     liveFunc func(descriptor ReplicaDescriptor) bool, neededVoters int, neededNonVoters int,
 ) RangeStatusReport {
     var res RangeStatusReport
-    // isBoth takes two replica predicates and returns their conjunction.
-    isBoth := func(
-        pred1 func(rDesc ReplicaDescriptor) bool,
-        pred2 func(rDesc ReplicaDescriptor) bool) func(ReplicaDescriptor) bool {
-        return func(rDesc ReplicaDescriptor) bool {
-            return pred1(rDesc) && pred2(rDesc)
-        }
-    }

     // This functions handles regular, or joint-consensus replica groups. In the
     // joint-consensus case, we'll independently consider the health of the
     // outgoing group ("old") and the incoming group ("new"). In the regular case,
     // the two groups will be identical.

-    votersOldGroup := d.FilterToDescriptors(ReplicaDescriptor.IsVoterOldConfig)
-    liveVotersOldGroup := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsVoterOldConfig, liveFunc))
+    oldGroup, oldLive := d.countLive(ReplicaDescriptor.IsVoterOldConfig, liveFunc)
+    newGroup, newLive := d.countLive(ReplicaDescriptor.IsVoterNewConfig, liveFunc)

-    n := len(votersOldGroup)
     // Empty groups succeed by default, to match the Raft implementation.
-    availableOutgoingGroup := (n == 0) || (len(liveVotersOldGroup) >= n/2+1)
+    availableOutgoingGroup := (oldGroup == 0) || (oldLive >= oldGroup/2+1)

-    votersNewGroup := d.FilterToDescriptors(ReplicaDescriptor.IsVoterNewConfig)
-    liveVotersNewGroup := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsVoterNewConfig, liveFunc))
-
-    n = len(votersNewGroup)
-    availableIncomingGroup := len(liveVotersNewGroup) >= n/2+1
+    availableIncomingGroup := newLive >= newGroup/2+1

     res.Available = availableIncomingGroup && availableOutgoingGroup

     // Determine over/under-replication of voting replicas. Note that learners
     // don't matter.
-    underReplicatedOldGroup := len(liveVotersOldGroup) < neededVoters
-    underReplicatedNewGroup := len(liveVotersNewGroup) < neededVoters
-    overReplicatedOldGroup := len(votersOldGroup) > neededVoters
-    overReplicatedNewGroup := len(votersNewGroup) > neededVoters
+    underReplicatedOldGroup := oldLive < neededVoters
+    underReplicatedNewGroup := newLive < neededVoters
+    overReplicatedOldGroup := oldGroup > neededVoters
+    overReplicatedNewGroup := newGroup > neededVoters
     res.UnderReplicated = underReplicatedOldGroup || underReplicatedNewGroup
     res.OverReplicated = overReplicatedOldGroup || overReplicatedNewGroup
     if neededNonVoters == -1 {
         return res
     }

-    nonVoters := d.FilterToDescriptors(ReplicaDescriptor.IsNonVoter)
-    liveNonVoters := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsNonVoter, liveFunc))
-    res.UnderReplicatedNonVoters = len(liveNonVoters) < neededNonVoters
-    res.OverReplicatedNonVoters = len(nonVoters) > neededNonVoters
+    nonVoters, liveNonVoters := d.countLive(ReplicaDescriptor.IsNonVoter, liveFunc)
+
+    res.UnderReplicatedNonVoters = liveNonVoters < neededNonVoters
+    res.OverReplicatedNonVoters = nonVoters > neededNonVoters
+
     return res
 }

+// ReplicationMargin computes the amount by which live voters exceed quorum. Values >= 0 indicate an available range,
+// as well as the number of additional replicas we can lose before losing quorum.
+func (d ReplicaSet) ReplicationMargin(liveFunc func(descriptor ReplicaDescriptor) bool) int {
+    // This functions handles regular, or joint-consensus replica groups. In the
+    // joint-consensus case, we'll independently consider the health of the
+    // outgoing group ("old") and the incoming group ("new"). In the regular case,
+    // the two groups will be identical.
+
+    oldGroup, oldLive := d.countLive(ReplicaDescriptor.IsVoterOldConfig, liveFunc)
+    newGroup, newLive := d.countLive(ReplicaDescriptor.IsVoterNewConfig, liveFunc)
+
+    marginIncomingGroup := newLive - (newGroup/2 + 1)
+    marginOutgoingGroup := marginIncomingGroup
+    // Empty groups succeed by default, to match the Raft implementation.
+    if oldGroup > 0 {
+        marginOutgoingGroup = oldLive - (oldGroup/2 + 1)
+    }
+    return min(marginIncomingGroup, marginOutgoingGroup)
+}
+
 // Empty returns true if `target` is an empty replication target.
 func Empty(target ReplicationTarget) bool {
     return target == ReplicationTarget{}

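The new `ReplicationMargin` helper above is the primitive the fault tolerance metric is built on. The standalone sketch below is simplified to a single voter group (the real code also handles the joint-consensus case by taking the minimum over the outgoing and incoming groups) and shows the arithmetic: the margin is the number of live voters in excess of quorum.

package main

import "fmt"

// quorumMargin returns liveVoters minus the quorum size for a group of
// totalVoters (assumed > 0). A value >= 0 means the range is available, and
// the value is the number of additional voters that can fail before quorum
// is lost.
func quorumMargin(totalVoters, liveVoters int) int {
	quorum := totalVoters/2 + 1
	return liveVoters - quorum
}

func main() {
	// 3 voters, all live: one more failure is survivable.
	fmt.Println(quorumMargin(3, 3)) // 1
	// 3 voters, one already down: available, but with no headroom.
	fmt.Println(quorumMargin(3, 2)) // 0
	// 3 voters, two down: quorum already lost.
	fmt.Println(quorumMargin(3, 1)) // -1
	// 5 voters, two down: still available, but only just.
	fmt.Println(quorumMargin(5, 3)) // 0
}

A margin of 0 therefore means the range is available but cannot survive another voter failure, which is exactly the boundary the metric reports per failure domain.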
pkg/server/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ go_library(
         "//pkg/server/decommissioning",
         "//pkg/server/diagnostics",
         "//pkg/server/diagnostics/diagnosticspb",
+        "//pkg/server/faulttolerance",
         "//pkg/server/goroutinedumper",
         "//pkg/server/license",
         "//pkg/server/pgurl",
pkg/server/faulttolerance/BUILD.bazel

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+go_library(
+    name = "faulttolerance",
+    srcs = ["locality.go"],
+    importpath = "github.com/cockroachdb/cockroach/pkg/server/faulttolerance",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/gossip",
+        "//pkg/kv/kvserver",
+        "//pkg/kv/kvserver/liveness/livenesspb",
+        "//pkg/roachpb",
+        "//pkg/util/protoutil",
+        "@com_github_cockroachdb_errors//:errors",
+    ],
+)
+
+go_test(
+    name = "faulttolerance_test",
+    srcs = ["locality_test.go"],
+    embed = [":faulttolerance"],
+    deps = [
+        "//pkg/kv/kvserver/liveness/livenesspb",
+        "//pkg/roachpb",
+        "@com_github_stretchr_testify//require",
+    ],
+)

pkg/server/faulttolerance/locality.go

Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
+// Copyright 2025 The Cockroach Authors.
+//
+// Use of this software is governed by the CockroachDB Software License
+// included in the /LICENSE file.
+
+// Package faulttolerance provides visibility into the cluster's ability
+// to tolerate failures across different failure domains.
+package faulttolerance
+
+import (
+    "context"
+    "iter"
+    "maps"
+    "slices"
+    "strings"
+
+    "github.com/cockroachdb/cockroach/pkg/gossip"
+    "github.com/cockroachdb/cockroach/pkg/kv/kvserver"
+    "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
+    "github.com/cockroachdb/cockroach/pkg/roachpb"
+    "github.com/cockroachdb/cockroach/pkg/util/protoutil"
+    "github.com/cockroachdb/errors"
+)
+
+// LocalityMapsFromGossip constructs a map from each NodeID to its
+// Locality, by deserializing the node descriptors from Gossip.
+func LocalityMapsFromGossip(g *gossip.Gossip) (map[roachpb.NodeID]roachpb.Locality, error) {
+    nodeLocality := make(map[roachpb.NodeID]roachpb.Locality)
+    if err := g.IterateInfos(gossip.KeyNodeDescPrefix, func(key string, info gossip.Info) error {
+        bs, err := info.Value.GetBytes()
+        if err != nil {
+            return errors.NewAssertionErrorWithWrappedErrf(err,
+                "failed to extract bytes for key %q", key)
+        }
+
+        var d roachpb.NodeDescriptor
+        if err := protoutil.Unmarshal(bs, &d); err != nil {
+            return errors.NewAssertionErrorWithWrappedErrf(err,
+                "failed to parse value for key %q", key)
+        }
+
+        // Don't use node descriptors with NodeID 0, because that's meant to
+        // indicate that the node has been removed from the cluster.
+        if d.NodeID != 0 {
+            nodeLocality[d.NodeID] = d.Locality
+        }
+        return nil
+    }); err != nil {
+        return nil, err
+    }
+    return nodeLocality, nil
+}
+
+// ComputeFaultTolerance computes the fault tolerance margin for each
+// failure domain. A failure domain is a set of nodes that map to a
+// single locality in the nodeLocality map, or a merged set of nodes
+// that share a locality prefix. The fault tolerance margin is the
+// number of additional nodes (beyond the nodes in the failure domain)
+// that can fail before any range experiences unavailability.
+//
+// Margins can be negative (indicating that the failure domain is
+// critical; its failure will cause unavailability), 0 (the failure
+// domain is not critical, but no additional node failures can be
+// tolerated), or positive.
+//
+// The keys in the returned map are locality strings, as returned by
+// `roachpb.Locality.String()`. They include "parent localities" as
+// well as the actual leaf localities to which nodes are assigned.
+//
+// The fault tolerance computation occurs in the context of the
+// livenessMap (to determine any existing node failures), and is
+// evaluated for only the replicas that appear in the replicas Seq.
+//
+// The evaluation for each replica is independent: it is valid to merge
+// the results of this computation for different sets of replicas by
+// taking the `min` of the margin values.
+func ComputeFaultTolerance(
+    ctx context.Context,
+    livenessMap livenesspb.NodeVitalityMap,
+    nodeLocality map[roachpb.NodeID]roachpb.Locality,
+    replicas iter.Seq[*kvserver.Replica],
+) (map[string]int, error) {
+    return computeFaultToleranceImpl(ctx, livenessMap, nodeLocality, replicas)
+}
+
+type replicaDescriber interface {
+    Desc() *roachpb.RangeDescriptor
+}
+
+func computeFaultToleranceImpl[RD replicaDescriber](
+    ctx context.Context,
+    livenessMap livenesspb.NodeVitalityMap,
+    nodeLocality map[roachpb.NodeID]roachpb.Locality,
+    replicas iter.Seq[RD],
+) (map[string]int, error) {
+    for n, l := range nodeLocality {
+        if len(l.Tiers) == 0 {
+            return nil, errors.AssertionFailedf("node %d missing locality", n)
+        }
+    }
+    failureDomainMap := makeFailureDomains(nodeLocality)
+
+    domainMargins := make(map[string]int)
+    for replica := range replicas {
+        for domainKey, fd := range failureDomainMap {
+            if err := ctx.Err(); err != nil {
+                return nil, err
+            }
+
+            margin := replica.Desc().Replicas().ReplicationMargin(func(rd roachpb.ReplicaDescriptor) bool {
+                if _, ok := fd.nodes[rd.NodeID]; ok {
+                    return false
+                }
+                return livenessMap[rd.NodeID].IsLive(livenesspb.SpanConfigConformance)
+            })
+
+            if oldMargin, ok := domainMargins[domainKey]; ok {
+                domainMargins[domainKey] = min(oldMargin, margin)
+            } else {
+                domainMargins[domainKey] = margin
+            }
+        }
+    }
+    return domainMargins, nil
+}
+
+func makeFailureDomains(
+    nodeLocality map[roachpb.NodeID]roachpb.Locality,
+) map[string]*failureDomain {
+    domainMap := make(map[string]*failureDomain)
+    var unresolved []string
+    // First, construct the leaf domains.
+    for _, l := range nodeLocality {
+        k := l.String()
+        if _, ok := domainMap[k]; !ok {
+            domainMap[k] = newFailureDomain(l, nodeLocality)
+        }
+        unresolved = append(unresolved, k)
+    }
+
+    // Sort the domains by descending length. In case the depth of the
+    // locality tree varies, we want to handle the taller trees first.
+    slices.SortStableFunc(unresolved, func(a, b string) int {
+        aComma := strings.Count(a, ",")
+        bComma := strings.Count(b, ",")
+        return bComma - aComma
+    })
+
+    // Merge existing domains into parent domains.
+    for len(unresolved) > 0 {
+        fd := domainMap[unresolved[0]]
+        unresolved = unresolved[1:]
+
+        pdKey := fd.parentKey()
+        if pdKey == "" {
+            continue
+        }
+        if parentFailureDomain, ok := domainMap[pdKey]; !ok {
+            // new unresolved parent domain
+            pd := fd.parent()
+            domainMap[pdKey] = pd
+            unresolved = append(unresolved, pdKey)
+        } else {
+            // merge child into parent domain
+            parentFailureDomain.merge(fd)
+        }
+    }
+    return domainMap
+}
+
+type failureDomain struct {
+    domain roachpb.Locality
+    nodes  map[roachpb.NodeID]struct{}
+}
+
+func (fd *failureDomain) merge(rhs *failureDomain) {
+    if match, _ := rhs.domain.Matches(fd.domain); !match {
+        panic("cannot merge failure domain")
+    }
+
+    for n := range rhs.nodes {
+        fd.nodes[n] = struct{}{}
+    }
+}
+
+func (fd *failureDomain) parentKey() string {
+    return roachpb.Locality{Tiers: fd.domain.Tiers[0 : len(fd.domain.Tiers)-1]}.String()
+}
+
+func newFailureDomain(
+    domain roachpb.Locality, nodeLocality map[roachpb.NodeID]roachpb.Locality,
+) *failureDomain {
+    faultScenario := make(map[roachpb.NodeID]struct{})
+    for node := range nodeLocality {
+        if match, _ := nodeLocality[node].Matches(domain); match {
+            faultScenario[node] = struct{}{}
+        }
+    }
+    return &failureDomain{
+        domain: domain,
+        nodes:  faultScenario,
+    }
+}
+
+func (fd *failureDomain) parent() *failureDomain {
+    pd := roachpb.Locality{Tiers: fd.domain.Tiers[0 : len(fd.domain.Tiers)-1]}
+    nodes := make(map[roachpb.NodeID]struct{}, len(fd.nodes))
+    maps.Copy(nodes, fd.nodes)
+    return &failureDomain{
+        domain: pd,
+        nodes:  nodes,
+    }
+}

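The package above depends on CockroachDB-internal types (`gossip.Gossip`, `kvserver.Replica`, `livenesspb.NodeVitalityMap`), so the self-contained sketch below re-creates only the shape of the algorithm with plain maps and slices: build failure domains from node localities (each leaf locality plus every parent prefix), then, for each range and each domain, compute the quorum margin while treating the domain's nodes as failed, keeping the per-domain minimum. All type and function names here are illustrative assumptions, not the commit's API.

package main

import (
	"fmt"
	"strings"
)

// A range is described by the node IDs of its voting replicas.
type rangeDesc struct{ voters []int }

// failureDomains maps a locality string (leaf or parent prefix, e.g.
// "region=us-east1,zone=a" and "region=us-east1") to the set of nodes it
// contains, mirroring the role of makeFailureDomains above.
func failureDomains(nodeLocality map[int]string) map[string]map[int]bool {
	domains := make(map[string]map[int]bool)
	add := func(key string, node int) {
		if domains[key] == nil {
			domains[key] = make(map[int]bool)
		}
		domains[key][node] = true
	}
	for node, loc := range nodeLocality {
		tiers := strings.Split(loc, ",")
		// Every prefix of the locality is a failure domain containing this node.
		for i := 1; i <= len(tiers); i++ {
			add(strings.Join(tiers[:i], ","), node)
		}
	}
	return domains
}

// faultTolerance computes, per failure domain, the minimum quorum margin
// across all ranges when the domain's nodes (plus any already-dead nodes)
// are treated as failed, mirroring computeFaultToleranceImpl.
func faultTolerance(
	ranges []rangeDesc, domains map[string]map[int]bool, dead map[int]bool,
) map[string]int {
	margins := make(map[string]int)
	for _, r := range ranges {
		for key, domain := range domains {
			live := 0
			for _, n := range r.voters {
				if !domain[n] && !dead[n] {
					live++
				}
			}
			margin := live - (len(r.voters)/2 + 1)
			if old, ok := margins[key]; !ok || margin < old {
				margins[key] = margin
			}
		}
	}
	return margins
}

func main() {
	// Six nodes, two per region; every range has one voter in each region.
	nodeLocality := map[int]string{
		1: "region=us-east1,zone=a", 2: "region=us-east1,zone=b",
		3: "region=us-west1,zone=a", 4: "region=us-west1,zone=b",
		5: "region=eu-west1,zone=a", 6: "region=eu-west1,zone=b",
	}
	ranges := []rangeDesc{{voters: []int{1, 3, 5}}, {voters: []int{2, 4, 6}}}
	margins := faultTolerance(ranges, failureDomains(nodeLocality), nil)
	// Losing any one region (or zone) leaves 2 of 3 voters live: margin 0.
	fmt.Println(margins["region=us-east1"], margins["region=us-east1,zone=a"]) // 0 0
}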