diff --git a/docs/generated/metrics/metrics.yaml b/docs/generated/metrics/metrics.yaml index 7fe4416b4ae9..09327bfa7d78 100644 --- a/docs/generated/metrics/metrics.yaml +++ b/docs/generated/metrics/metrics.yaml @@ -10201,6 +10201,15 @@ layers: essential: true - name: REPLICATION metrics: + - name: fault_tolerance.nodes + exported_name: fault_tolerance_nodes + description: "Number of nodes that can fail before we lose quorum on any range, \n if the labeled failure domain goes down. 0 indicates that we can tolerate a fault\n in the failure domain, but only just. Negative values indicate that the failure \n domain is critical to continued availability for at least one range." + y_axis_label: Nodes + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE + how_to_use: Take the minimum, faceted across all labels - name: leases.transfers.success exported_name: leases_transfers_success description: Number of successful lease transfers diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 34381e655f75..e829f5c0b89d 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -351,6 +351,7 @@ ALL_TESTS = [ "//pkg/server/diagnostics:diagnostics_disallowed_imports_test", "//pkg/server/diagnostics:diagnostics_test", "//pkg/server/dumpstore:dumpstore_test", + "//pkg/server/faulttolerance:faulttolerance_test", "//pkg/server/goroutinedumper:goroutinedumper_test", "//pkg/server/license:license_test", "//pkg/server/pgurl:pgurl_test", @@ -1745,6 +1746,8 @@ GO_TARGETS = [ "//pkg/server/diagnostics:diagnostics_test", "//pkg/server/dumpstore:dumpstore", "//pkg/server/dumpstore:dumpstore_test", + "//pkg/server/faulttolerance:faulttolerance", + "//pkg/server/faulttolerance:faulttolerance_test", "//pkg/server/goroutinedumper:goroutinedumper", "//pkg/server/goroutinedumper:goroutinedumper_test", "//pkg/server/license:license", diff --git a/pkg/roachpb/metadata_replicas.go b/pkg/roachpb/metadata_replicas.go index fc6db641bf4f..ad32b17e9b1b 100644 --- a/pkg/roachpb/metadata_replicas.go +++ b/pkg/roachpb/metadata_replicas.go @@ -393,11 +393,24 @@ type RangeStatusReport struct { UnderReplicatedNonVoters, OverReplicatedNonVoters bool } +// countLive will first filter the ReplicaSet using filterPred, then +// count the live descriptors using livePred. It returns the size of +// both sets. +func (d ReplicaSet) countLive( + filterPred func(rDesc ReplicaDescriptor) bool, livePred func(rDesc ReplicaDescriptor) bool, +) (total, live int) { + filtered := d.FilterToDescriptors(filterPred) + wrappedFiltered := ReplicaSet{wrapped: filtered} + liveDesc := wrappedFiltered.FilterToDescriptors(livePred) + + return len(filtered), len(liveDesc) +} + // ReplicationStatus returns availability and over/under-replication // determinations for the range. // // neededVoters is the replica's desired replication for purposes of determining -// over/under-replication of voters. If the caller is only interested in +// over/under-replication of voters. If the caller is only interested in the // availability of voting replicas, 0 can be passed in. neededNonVoters is the // counterpart for non-voting replicas but with -1 as the sentinel value (unlike // voters, it's possible to expect 0 non-voters). @@ -405,54 +418,62 @@ func (d ReplicaSet) ReplicationStatus( liveFunc func(descriptor ReplicaDescriptor) bool, neededVoters int, neededNonVoters int, ) RangeStatusReport { var res RangeStatusReport - // isBoth takes two replica predicates and returns their conjunction. 
- isBoth := func( - pred1 func(rDesc ReplicaDescriptor) bool, - pred2 func(rDesc ReplicaDescriptor) bool) func(ReplicaDescriptor) bool { - return func(rDesc ReplicaDescriptor) bool { - return pred1(rDesc) && pred2(rDesc) - } - } // This functions handles regular, or joint-consensus replica groups. In the // joint-consensus case, we'll independently consider the health of the // outgoing group ("old") and the incoming group ("new"). In the regular case, // the two groups will be identical. - votersOldGroup := d.FilterToDescriptors(ReplicaDescriptor.IsVoterOldConfig) - liveVotersOldGroup := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsVoterOldConfig, liveFunc)) + oldGroup, oldLive := d.countLive(ReplicaDescriptor.IsVoterOldConfig, liveFunc) + newGroup, newLive := d.countLive(ReplicaDescriptor.IsVoterNewConfig, liveFunc) - n := len(votersOldGroup) // Empty groups succeed by default, to match the Raft implementation. - availableOutgoingGroup := (n == 0) || (len(liveVotersOldGroup) >= n/2+1) + availableOutgoingGroup := (oldGroup == 0) || (oldLive >= oldGroup/2+1) - votersNewGroup := d.FilterToDescriptors(ReplicaDescriptor.IsVoterNewConfig) - liveVotersNewGroup := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsVoterNewConfig, liveFunc)) - - n = len(votersNewGroup) - availableIncomingGroup := len(liveVotersNewGroup) >= n/2+1 + availableIncomingGroup := newLive >= newGroup/2+1 res.Available = availableIncomingGroup && availableOutgoingGroup // Determine over/under-replication of voting replicas. Note that learners // don't matter. - underReplicatedOldGroup := len(liveVotersOldGroup) < neededVoters - underReplicatedNewGroup := len(liveVotersNewGroup) < neededVoters - overReplicatedOldGroup := len(votersOldGroup) > neededVoters - overReplicatedNewGroup := len(votersNewGroup) > neededVoters + underReplicatedOldGroup := oldLive < neededVoters + underReplicatedNewGroup := newLive < neededVoters + overReplicatedOldGroup := oldGroup > neededVoters + overReplicatedNewGroup := newGroup > neededVoters res.UnderReplicated = underReplicatedOldGroup || underReplicatedNewGroup res.OverReplicated = overReplicatedOldGroup || overReplicatedNewGroup if neededNonVoters == -1 { return res } - nonVoters := d.FilterToDescriptors(ReplicaDescriptor.IsNonVoter) - liveNonVoters := d.FilterToDescriptors(isBoth(ReplicaDescriptor.IsNonVoter, liveFunc)) - res.UnderReplicatedNonVoters = len(liveNonVoters) < neededNonVoters - res.OverReplicatedNonVoters = len(nonVoters) > neededNonVoters + nonVoters, liveNonVoters := d.countLive(ReplicaDescriptor.IsNonVoter, liveFunc) + + res.UnderReplicatedNonVoters = liveNonVoters < neededNonVoters + res.OverReplicatedNonVoters = nonVoters > neededNonVoters + return res } +// ReplicationMargin computes the amount by which live voters exceed quorum. Values >= 0 indicate an available range, +// as well as the number of additional replicas we can lose before losing quorum. +func (d ReplicaSet) ReplicationMargin(liveFunc func(descriptor ReplicaDescriptor) bool) int { + // This functions handles regular, or joint-consensus replica groups. In the + // joint-consensus case, we'll independently consider the health of the + // outgoing group ("old") and the incoming group ("new"). In the regular case, + // the two groups will be identical. 
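+ //
+ // Worked example (illustrative numbers only): with 5 live out of 5 voters,
+ // quorum is 5/2+1 = 3 and the margin is 5-3 = 2; with 1 live out of 3 voters,
+ // the margin is 1-2 = -1, i.e. the range is already unavailable. In a joint
+ // configuration the overall margin is the min of the incoming and outgoing
+ // group margins, computed below.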
+ + oldGroup, oldLive := d.countLive(ReplicaDescriptor.IsVoterOldConfig, liveFunc) + newGroup, newLive := d.countLive(ReplicaDescriptor.IsVoterNewConfig, liveFunc) + + marginIncomingGroup := newLive - (newGroup/2 + 1) + marginOutgoingGroup := marginIncomingGroup + // Empty groups succeed by default, to match the Raft implementation. + if oldGroup > 0 { + marginOutgoingGroup = oldLive - (oldGroup/2 + 1) + } + return min(marginIncomingGroup, marginOutgoingGroup) +} + // Empty returns true if `target` is an empty replication target. func Empty(target ReplicationTarget) bool { return target == ReplicationTarget{} diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel index 3994c6704706..dd8b1022c726 100644 --- a/pkg/server/BUILD.bazel +++ b/pkg/server/BUILD.bazel @@ -179,6 +179,7 @@ go_library( "//pkg/server/decommissioning", "//pkg/server/diagnostics", "//pkg/server/diagnostics/diagnosticspb", + "//pkg/server/faulttolerance", "//pkg/server/goroutinedumper", "//pkg/server/license", "//pkg/server/pgurl", diff --git a/pkg/server/faulttolerance/BUILD.bazel b/pkg/server/faulttolerance/BUILD.bazel new file mode 100644 index 000000000000..abf155b16bba --- /dev/null +++ b/pkg/server/faulttolerance/BUILD.bazel @@ -0,0 +1,27 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "faulttolerance", + srcs = ["locality.go"], + importpath = "github.com/cockroachdb/cockroach/pkg/server/faulttolerance", + visibility = ["//visibility:public"], + deps = [ + "//pkg/gossip", + "//pkg/kv/kvserver", + "//pkg/kv/kvserver/liveness/livenesspb", + "//pkg/roachpb", + "//pkg/util/protoutil", + "@com_github_cockroachdb_errors//:errors", + ], +) + +go_test( + name = "faulttolerance_test", + srcs = ["locality_test.go"], + embed = [":faulttolerance"], + deps = [ + "//pkg/kv/kvserver/liveness/livenesspb", + "//pkg/roachpb", + "@com_github_stretchr_testify//require", + ], +) diff --git a/pkg/server/faulttolerance/locality.go b/pkg/server/faulttolerance/locality.go new file mode 100644 index 000000000000..881336534bc2 --- /dev/null +++ b/pkg/server/faulttolerance/locality.go @@ -0,0 +1,213 @@ +// Copyright 2025 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +// Package faulttolerance provides visibility into the cluster's ability +// to tolerate failures across different failure domains. +package faulttolerance + +import ( + "context" + "iter" + "maps" + "slices" + "strings" + + "github.com/cockroachdb/cockroach/pkg/gossip" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/util/protoutil" + "github.com/cockroachdb/errors" +) + +// LocalityMapsFromGossip constructs a map from each NodeID to its +// Locality, by deserializing the node descriptors from Gossip. 
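+//
+// The result maps each node ID to its configured locality, e.g.
+// (hypothetical values) 1 -> "region=us-east,zone=us-east-1". Descriptors
+// with NodeID 0 denote removed nodes and are skipped.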
+func LocalityMapsFromGossip(g *gossip.Gossip) (map[roachpb.NodeID]roachpb.Locality, error) { + nodeLocality := make(map[roachpb.NodeID]roachpb.Locality) + if err := g.IterateInfos(gossip.KeyNodeDescPrefix, func(key string, info gossip.Info) error { + bs, err := info.Value.GetBytes() + if err != nil { + return errors.NewAssertionErrorWithWrappedErrf(err, + "failed to extract bytes for key %q", key) + } + + var d roachpb.NodeDescriptor + if err := protoutil.Unmarshal(bs, &d); err != nil { + return errors.NewAssertionErrorWithWrappedErrf(err, + "failed to parse value for key %q", key) + } + + // Don't use node descriptors with NodeID 0, because that's meant to + // indicate that the node has been removed from the cluster. + if d.NodeID != 0 { + nodeLocality[d.NodeID] = d.Locality + } + return nil + }); err != nil { + return nil, err + } + return nodeLocality, nil +} + +// ComputeFaultTolerance computes the fault tolerance margin for each +// failure domain. A failure domain is a set of nodes that map to a +// single locality in the nodeLocality map, or a merged set of nodes +// that share a locality prefix. The fault tolerance margin is the +// number of additional nodes (beyond the nodes in the failure domain) +// that can fail before any range experiences unavailability. +// +// Margins can be negative (indicating that the failure domain is +// critical; its failure will cause unavailability), 0 (the failure +// domain is not critical, but no additional node failures can be +// tolerated), or positive. +// +// The keys in the returned map are locality strings, as returned by +// `roachpb.Locality.String()`. They include "parent localities" as +// well as the actual leaf localities to which nodes are assigned. +// +// The fault tolerance computation occurs in the context of the +// livenessMap (to determine any existing node failures), and is +// evaluated for only the replicas that appear in the replicas Seq. +// +// The evaluation for each replica is independent: it is valid to merge +// the results of this computation for different sets of replicas by +// taking the `min` of the margin values. 
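+//
+// For example (hypothetical numbers), if one batch of replicas yields
+// {"region=us-east": 1} and another yields {"region=us-east": -1}, the merged
+// value for "region=us-east" is min(1, -1) = -1.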
+func ComputeFaultTolerance( + ctx context.Context, + livenessMap livenesspb.NodeVitalityMap, + nodeLocality map[roachpb.NodeID]roachpb.Locality, + replicas iter.Seq[*kvserver.Replica], +) (map[string]int, error) { + return computeFaultToleranceImpl(ctx, livenessMap, nodeLocality, replicas) +} + +type replicaDescriber interface { + Desc() *roachpb.RangeDescriptor +} + +func computeFaultToleranceImpl[RD replicaDescriber]( + ctx context.Context, + livenessMap livenesspb.NodeVitalityMap, + nodeLocality map[roachpb.NodeID]roachpb.Locality, + replicas iter.Seq[RD], +) (map[string]int, error) { + for n, l := range nodeLocality { + if len(l.Tiers) == 0 { + return nil, errors.AssertionFailedf("node %d missing locality", n) + } + } + failureDomainMap := makeFailureDomains(nodeLocality) + + domainMargins := make(map[string]int) + for replica := range replicas { + for domainKey, fd := range failureDomainMap { + if err := ctx.Err(); err != nil { + return nil, err + } + + margin := replica.Desc().Replicas().ReplicationMargin(func(rd roachpb.ReplicaDescriptor) bool { + if _, ok := fd.nodes[rd.NodeID]; ok { + return false + } + return livenessMap[rd.NodeID].IsLive(livenesspb.SpanConfigConformance) + }) + + if oldMargin, ok := domainMargins[domainKey]; ok { + domainMargins[domainKey] = min(oldMargin, margin) + } else { + domainMargins[domainKey] = margin + } + } + } + return domainMargins, nil +} + +func makeFailureDomains( + nodeLocality map[roachpb.NodeID]roachpb.Locality, +) map[string]*failureDomain { + domainMap := make(map[string]*failureDomain) + var unresolved []string + // First, construct the leaf domains. + for _, l := range nodeLocality { + k := l.String() + if _, ok := domainMap[k]; !ok { + domainMap[k] = newFailureDomain(l, nodeLocality) + } + unresolved = append(unresolved, k) + } + + // Sort the domains by descending length. In case the depth of the + // locality tree varies, we want to handle the taller trees first. + slices.SortStableFunc(unresolved, func(a, b string) int { + aComma := strings.Count(a, ",") + bComma := strings.Count(b, ",") + return bComma - aComma + }) + + // Merge existing domains into parent domains. 
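+ // For example, the leaf domains "region=us-east,zone=us-east-1" and
+ // "region=us-east,zone=us-east-2" both resolve to the parent key
+ // "region=us-east": the first child encountered seeds the parent (its nodes
+ // are carried over by parent()), and subsequent children are merged into it.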
+ for len(unresolved) > 0 { + fd := domainMap[unresolved[0]] + unresolved = unresolved[1:] + + pdKey := fd.parentKey() + if pdKey == "" { + continue + } + if parentFailureDomain, ok := domainMap[pdKey]; !ok { + // new unresolved parent domain + pd := fd.parent() + domainMap[pdKey] = pd + unresolved = append(unresolved, pdKey) + } else { + // merge child into parent domain + parentFailureDomain.merge(fd) + } + } + return domainMap +} + +type failureDomain struct { + domain roachpb.Locality + nodes map[roachpb.NodeID]struct{} +} + +func (fd *failureDomain) merge(rhs *failureDomain) { + if match, _ := rhs.domain.Matches(fd.domain); !match { + panic("cannot merge failure domain") + } + + for n := range rhs.nodes { + fd.nodes[n] = struct{}{} + } +} + +func (fd *failureDomain) parentKey() string { + return roachpb.Locality{Tiers: fd.domain.Tiers[0 : len(fd.domain.Tiers)-1]}.String() +} + +func newFailureDomain( + domain roachpb.Locality, nodeLocality map[roachpb.NodeID]roachpb.Locality, +) *failureDomain { + faultScenario := make(map[roachpb.NodeID]struct{}) + for node := range nodeLocality { + if match, _ := nodeLocality[node].Matches(domain); match { + faultScenario[node] = struct{}{} + } + } + return &failureDomain{ + domain: domain, + nodes: faultScenario, + } +} + +func (fd *failureDomain) parent() *failureDomain { + pd := roachpb.Locality{Tiers: fd.domain.Tiers[0 : len(fd.domain.Tiers)-1]} + nodes := make(map[roachpb.NodeID]struct{}, len(fd.nodes)) + maps.Copy(nodes, fd.nodes) + return &failureDomain{ + domain: pd, + nodes: nodes, + } +} diff --git a/pkg/server/faulttolerance/locality_test.go b/pkg/server/faulttolerance/locality_test.go new file mode 100644 index 000000000000..7777d3e7609e --- /dev/null +++ b/pkg/server/faulttolerance/locality_test.go @@ -0,0 +1,340 @@ +// Copyright 2025 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +package faulttolerance + +import ( + "context" + "iter" + "testing" + + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/stretchr/testify/require" +) + +// testReplica is a mock implementation of the replicaDescriber interface. +type testReplica struct { + desc *roachpb.RangeDescriptor +} + +// Desc implements the replicaDescriber interface. +func (r *testReplica) Desc() *roachpb.RangeDescriptor { + return r.desc +} + +// createReplicaDescriptor creates a ReplicaDescriptor with the given node ID. +func createReplicaDescriptor(nodeID roachpb.NodeID) roachpb.ReplicaDescriptor { + return roachpb.ReplicaDescriptor{ + NodeID: nodeID, + Type: roachpb.VOTER_FULL, + } +} + +// createTestReplica creates a testReplica with the given replica descriptors. +func createTestReplica(replicas ...roachpb.ReplicaDescriptor) *testReplica { + desc := &roachpb.RangeDescriptor{ + InternalReplicas: replicas, + } + return &testReplica{desc: desc} +} + +// TestMakeFailureDomains tests the makeFailureDomains function, which is the core +// of the ComputeFaultTolerance function. 
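+//
+// The expected keys are roachpb.Locality.String() values; in the hierarchical
+// case the parent domains (e.g. "region=us-east") are expected alongside the
+// leaf domains (e.g. "region=us-east,zone=us-east-1").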
+func TestMakeFailureDomains(t *testing.T) { + tests := []struct { + name string + nodeLocalities map[roachpb.NodeID]roachpb.Locality + expectedDomains []string + }{ + { + name: "single region", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 2: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 3: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + }, + expectedDomains: []string{ + "region=us-east", + }, + }, + { + name: "multiple regions", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 2: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-west"}}}, + 3: {Tiers: []roachpb.Tier{{Key: "region", Value: "eu-west"}}}, + }, + expectedDomains: []string{ + "region=us-east", + "region=us-west", + "region=eu-west", + }, + }, + { + name: "hierarchical localities", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{ + {Key: "region", Value: "us-east"}, + {Key: "zone", Value: "us-east-1"}, + }}, + 2: {Tiers: []roachpb.Tier{ + {Key: "region", Value: "us-east"}, + {Key: "zone", Value: "us-east-2"}, + }}, + 3: {Tiers: []roachpb.Tier{ + {Key: "region", Value: "us-west"}, + {Key: "zone", Value: "us-west-1"}, + }}, + }, + expectedDomains: []string{ + "region=us-east", + "region=us-west", + "region=us-east,zone=us-east-1", + "region=us-east,zone=us-east-2", + "region=us-west,zone=us-west-1", + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // Call makeFailureDomains with the test case's nodeLocalities + domains := makeFailureDomains(tc.nodeLocalities) + + // Check that all expected domains are present + for _, expectedDomain := range tc.expectedDomains { + _, ok := domains[expectedDomain] + require.True(t, ok, "Expected domain %q not found", expectedDomain) + } + + // Check that there are no unexpected domains + require.Equal(t, len(tc.expectedDomains), len(domains), "Unexpected number of domains") + }) + } +} + +func TestComputeFaultTolerance(t *testing.T) { + tests := []struct { + name string + nodeLocalities map[roachpb.NodeID]roachpb.Locality + livenessMap livenesspb.NodeVitalityMap + replicas []*testReplica + expected map[string]int + }{ + { + name: "all nodes live, single region", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 2: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 3: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + }, + livenessMap: livenesspb.NodeVitalityMap{ + 1: livenesspb.FakeNodeVitality(true), + 2: livenesspb.FakeNodeVitality(true), + 3: livenesspb.FakeNodeVitality(true), + }, + replicas: []*testReplica{ + createTestReplica( + createReplicaDescriptor(1), + createReplicaDescriptor(2), + createReplicaDescriptor(3), + ), + }, + expected: map[string]int{ + "region=us-east": -2, // All nodes in same region, so failure of region means no quorum + }, + }, + { + name: "all nodes live, multiple regions", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 2: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-west"}}}, + 3: {Tiers: []roachpb.Tier{{Key: "region", Value: "eu-west"}}}, + }, + livenessMap: livenesspb.NodeVitalityMap{ + 1: livenesspb.FakeNodeVitality(true), + 2: livenesspb.FakeNodeVitality(true), + 3: livenesspb.FakeNodeVitality(true), + }, + replicas: []*testReplica{ + 
createTestReplica( + createReplicaDescriptor(1), + createReplicaDescriptor(2), + createReplicaDescriptor(3), + ), + }, + expected: map[string]int{ + "region=us-east": 0, // Can lose us-east and still have quorum + "region=us-west": 0, // Can lose us-west and still have quorum + "region=eu-west": 0, // Can lose eu-west and still have quorum + }, + }, + { + name: "some nodes dead, multiple regions", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 2: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-west"}}}, + 3: {Tiers: []roachpb.Tier{{Key: "region", Value: "eu-west"}}}, + 4: {Tiers: []roachpb.Tier{{Key: "region", Value: "eu-west"}}}, + }, + livenessMap: livenesspb.NodeVitalityMap{ + 1: livenesspb.FakeNodeVitality(true), + 2: livenesspb.FakeNodeVitality(true), + 3: livenesspb.FakeNodeVitality(false), // Node 3 is dead + 4: livenesspb.FakeNodeVitality(true), + }, + replicas: []*testReplica{ + createTestReplica( + createReplicaDescriptor(1), + createReplicaDescriptor(2), + createReplicaDescriptor(3), + ), + createTestReplica( + createReplicaDescriptor(1), + createReplicaDescriptor(2), + createReplicaDescriptor(4), + ), + }, + expected: map[string]int{ + "region=us-east": -1, // With node 3 down, can't lose the replica in us-east + "region=us-west": -1, // With node 3 down, can't lose the replica in us-west + "region=eu-west": 0, // We can tolerate node 3 and 4 down, since no range needs both nodes + }, + }, + { + name: "hierarchical localities", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{ + {Key: "region", Value: "us-east"}, + {Key: "zone", Value: "us-east-1"}, + }}, + 2: {Tiers: []roachpb.Tier{ + {Key: "region", Value: "us-east"}, + {Key: "zone", Value: "us-east-2"}, + }}, + 3: {Tiers: []roachpb.Tier{ + {Key: "region", Value: "us-west"}, + {Key: "zone", Value: "us-west-1"}, + }}, + }, + livenessMap: livenesspb.NodeVitalityMap{ + 1: livenesspb.FakeNodeVitality(true), + 2: livenesspb.FakeNodeVitality(true), + 3: livenesspb.FakeNodeVitality(true), + }, + replicas: []*testReplica{ + createTestReplica( + createReplicaDescriptor(1), + createReplicaDescriptor(2), + createReplicaDescriptor(3), + ), + }, + expected: map[string]int{ + "region=us-east": -1, // Losing us-east means losing 2 nodes, which breaks quorum + "region=us-west": 0, // Can lose us-west and still have quorum + "region=us-east,zone=us-east-1": 0, // Can lose us-east-1 and still have quorum + "region=us-east,zone=us-east-2": 0, // Can lose us-east-2 and still have quorum + "region=us-west,zone=us-west-1": 0, // Can lose us-west-1 and still have quorum + }, + }, + { + name: "multiple replicas with different margins", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 2: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-west"}}}, + 3: {Tiers: []roachpb.Tier{{Key: "region", Value: "eu-west"}}}, + 4: {Tiers: []roachpb.Tier{{Key: "region", Value: "ap-east"}}}, + }, + livenessMap: livenesspb.NodeVitalityMap{ + 1: livenesspb.FakeNodeVitality(true), + 2: livenesspb.FakeNodeVitality(true), + 3: livenesspb.FakeNodeVitality(true), + 4: livenesspb.FakeNodeVitality(false), + }, + replicas: []*testReplica{ + // First replica with nodes 1, 2, 3 + createTestReplica( + createReplicaDescriptor(1), + createReplicaDescriptor(2), + createReplicaDescriptor(3), + ), + // Second replica with nodes 2, 3, 4 + createTestReplica( + createReplicaDescriptor(2), + 
createReplicaDescriptor(3), + createReplicaDescriptor(4), + ), + }, + expected: map[string]int{ + "region=us-east": 0, // Can lose us-east in both replicas + "region=us-west": -1, // Losing us-west would break quorum in second replica + "region=eu-west": -1, // Losing eu-west would break quorum in second replica + "region=ap-east": 0, // Can lose ap-east in both replicas + }, + }, + { + name: "region plus node fault tolerance", + nodeLocalities: map[roachpb.NodeID]roachpb.Locality{ + 1: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 2: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 3: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-east"}}}, + 4: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-west"}}}, + 5: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-west"}}}, + 6: {Tiers: []roachpb.Tier{{Key: "region", Value: "us-west"}}}, + 7: {Tiers: []roachpb.Tier{{Key: "region", Value: "eu-west"}}}, + 8: {Tiers: []roachpb.Tier{{Key: "region", Value: "eu-west"}}}, + 9: {Tiers: []roachpb.Tier{{Key: "region", Value: "eu-west"}}}, + }, + livenessMap: livenesspb.TestCreateNodeVitality(1, 2, 3, 4, 5, 6, 7, 8, 9).ScanNodeVitalityFromCache(), + replicas: []*testReplica{ + createTestReplica( + createReplicaDescriptor(1), + createReplicaDescriptor(2), + createReplicaDescriptor(3), + createReplicaDescriptor(4), + createReplicaDescriptor(5), + createReplicaDescriptor(6), + createReplicaDescriptor(7), + createReplicaDescriptor(8), + createReplicaDescriptor(9), + ), + }, + expected: map[string]int{ + "region=us-east": 1, // Can lose each region, and one other node at RF=9 + "region=us-west": 1, + "region=eu-west": 1, + }, + }, + } + + // Create a function that satisfies the iter.Seq[replicaDescriber] interface + // by using the testReplica objects directly. 
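+ // (iter.Seq is a function type, not an interface: a push-style iterator that
+ // calls yield once per element and stops as soon as yield returns false.)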
+ mockReplicaIterator := func(replicas []*testReplica) iter.Seq[replicaDescriber] { + return func(yield func(replicaDescriber) bool) { + for _, r := range replicas { + if !yield(r) { + break + } + } + } + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ctx := context.Background() + + // Create a sequence of replicas for testing + replicaSeq := mockReplicaIterator(tc.replicas) + + // Call computeFaultToleranceImpl directly + result, err := computeFaultToleranceImpl(ctx, tc.livenessMap, tc.nodeLocalities, replicaSeq) + require.NoError(t, err) + + // Verify the results + require.Equal(t, tc.expected, result) + }) + } +} diff --git a/pkg/server/node.go b/pkg/server/node.go index d605760f3ab1..cd7d6b53241e 100644 --- a/pkg/server/node.go +++ b/pkg/server/node.go @@ -10,6 +10,7 @@ import ( "context" "fmt" "io" + "iter" "math" "net" "sort" @@ -40,6 +41,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/multitenant/tenantcapabilities/tenantcapabilitieswatcher" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/rpc/rpcbase" + "github.com/cockroachdb/cockroach/pkg/server/faulttolerance" "github.com/cockroachdb/cockroach/pkg/server/license" "github.com/cockroachdb/cockroach/pkg/server/status" "github.com/cockroachdb/cockroach/pkg/server/tenantsettingswatcher" @@ -59,6 +61,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/buildutil" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/iterutil" "github.com/cockroachdb/cockroach/pkg/util/limit" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/log/eventpb" @@ -83,6 +86,7 @@ import ( "github.com/cockroachdb/logtags" "github.com/cockroachdb/pebble" "github.com/cockroachdb/redact" + io_prometheus_client "github.com/prometheus/client_model/go" "google.golang.org/grpc/codes" grpcstatus "google.golang.org/grpc/status" ) @@ -194,6 +198,18 @@ This metric is thus not an indicator of KV health.`, Measurement: "Bytes", Unit: metric.Unit_BYTES, } + metaFaultTolerance = metric.Metadata{ + Name: "fault_tolerance.nodes", + Help: `Number of nodes that can fail before we lose quorum on any range, + if the labeled failure domain goes down. 0 indicates that we can tolerate a fault + in the failure domain, but only just. Negative values indicate that the failure + domain is critical to continued availability for at least one range.`, + Measurement: "Nodes", + Unit: metric.Unit_COUNT, + Category: metric.Metadata_REPLICATION, + MetricType: io_prometheus_client.MetricType_GAUGE, + HowToUse: "Take the minimum, faceted across all labels", + } ) // Cluster settings. @@ -263,6 +279,7 @@ type nodeMetrics struct { // Note that there could be multiple buffered senders in a node. 
BufferedSenderMetrics *rangefeed.BufferedSenderMetrics LockedMuxStreamMetrics *rangefeed.LockedMuxStreamMetrics + FaultTolerance *metric.GaugeVec } func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) *nodeMetrics { @@ -285,6 +302,7 @@ func makeNodeMetrics(reg *metric.Registry, histogramWindow time.Duration) *nodeM StreamManagerMetrics: rangefeed.NewStreamManagerMetrics(), BufferedSenderMetrics: rangefeed.NewBufferedSenderMetrics(), LockedMuxStreamMetrics: rangefeed.NewLockedMuxStreamMetrics(histogramWindow), + FaultTolerance: metric.NewExportedGaugeVec(metaFaultTolerance, []string{"Locality"}), } for i := range nm.MethodCounts { @@ -1237,11 +1255,92 @@ func (n *Node) computeMetricsPeriodically( } return nil }) + if err != nil { + return err + } + n.updateNodeRangeCount() n.storeCfg.KVFlowStreamTokenProvider.UpdateMetricGauges() + err = n.updateFaultToleranceMetrics(ctx) return err } +// Replicas returns an iter.Seq over this node's Replicas. +func (n *Node) Replicas() iter.Seq[*kvserver.Replica] { + return func(yield func(*kvserver.Replica) bool) { + keepGoing := true + _ = n.stores.VisitStores(func(store *kvserver.Store) error { + store.VisitReplicas(func(replica *kvserver.Replica) bool { + keepGoing = yield(replica) + return keepGoing + }) + if !keepGoing { + return iterutil.StopIteration() + } + return nil + }) + } +} + +// StoreReplicas returns an iter.Seq2 over this node's Store, Replica pairs. +func (n *Node) StoreReplicas() iter.Seq2[*kvserver.Store, *kvserver.Replica] { + return func(yield func(*kvserver.Store, *kvserver.Replica) bool) { + keepGoing := true + _ = n.stores.VisitStores(func(store *kvserver.Store) error { + store.VisitReplicas(func(replica *kvserver.Replica) bool { + keepGoing = yield(store, replica) + return keepGoing + }) + if !keepGoing { + return iterutil.StopIteration() + } + return nil + }) + } +} + +// updateFaultToleranceMetrics updates the FaultTolerance gauges. +func (n *Node) updateFaultToleranceMetrics(ctx context.Context) error { + localityMap, err := faulttolerance.LocalityMapsFromGossip(n.storeCfg.Gossip) + if err != nil { + return err + } + + // Check that all nodes have a locality set. + nodesWithLocality := 0 + nodesMissingLocality := 0 + for _, l := range localityMap { + if len(l.Tiers) > 0 { + nodesWithLocality++ + } else { + nodesMissingLocality++ + } + } + if nodesWithLocality == 0 { + // Locality isn't set anywhere; this metric is meaningless in this cluster. + return nil + } + if nodesMissingLocality > 0 { + log.Warningf(ctx, "skipping fault tolerance metric update: some nodes missing locality") + return nil + } + + if domainMargins, err := faulttolerance.ComputeFaultTolerance( + ctx, + n.storeCfg.NodeLiveness.ScanNodeVitalityFromCache(), + localityMap, + n.Replicas(), + ); err == nil { + for domainKey, margin := range domainMargins { + n.metrics.FaultTolerance.Update(map[string]string{"Locality": domainKey}, int64(margin)) + } + } else { + return err + } + + return nil +} + // UpdateIOThreshold relays the supplied IOThreshold to the same method on the // designated Store. func (n *Node) UpdateIOThreshold(id roachpb.StoreID, threshold *admissionpb.IOThreshold) {
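
As a usage sketch (not part of the diff above): the exported gauge is meant to be read per locality label and aggregated by taking the minimum, so a consumer that already has per-domain margins could summarize them roughly as follows. The function and variable names below are illustrative, not part of the CockroachDB API.

package main

import (
	"fmt"
	"math"
)

// summarizeFaultTolerance mirrors the metric's documented use ("take the
// minimum, faceted across all labels"): it returns the overall headroom and
// the failure domains that are currently critical (margin < 0).
func summarizeFaultTolerance(margins map[string]int) (overall int, critical []string) {
	overall = math.MaxInt // no data means no known constraint
	for domain, margin := range margins {
		if margin < overall {
			overall = margin
		}
		if margin < 0 {
			critical = append(critical, domain)
		}
	}
	return overall, critical
}

func main() {
	// Hypothetical per-domain margins, keyed by roachpb.Locality strings.
	margins := map[string]int{
		"region=us-east": 1,
		"region=us-west": 0,
		"region=eu-west": -1,
	}
	overall, critical := summarizeFaultTolerance(margins)
	fmt.Println(overall, critical) // -1 [region=eu-west]
}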