kvserver: clear RHS when learning about a split through a snapshot #148781

Open · wants to merge 1 commit into master
2 changes: 0 additions & 2 deletions pkg/kv/kvserver/client_split_test.go
@@ -4566,8 +4566,6 @@ func TestStoreRangeSplitRaftSnapshotAfterRHSRebalanced(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

skip.WithIssue(t, 73462)

ctx := context.Background()
// Start a 5 node cluster.
tc := testcluster.StartTestCluster(t, 5, base.TestClusterArgs{
11 changes: 10 additions & 1 deletion pkg/kv/kvserver/replica_raftstorage.go
@@ -7,6 +7,7 @@ package kvserver

import (
"context"
"sort"
"time"

"github.com/cockroachdb/cockroach/pkg/keys"
@@ -583,6 +584,10 @@ func (r *Replica) applySnapshotRaftMuLocked(
subsumedDescs = append(subsumedDescs, sr.Desc())
}

// NB: subsumedDescs in prepareSnapApplyInput must be sorted by start key.
sort.Slice(subsumedDescs, func(i, j int) bool {
return subsumedDescs[i].StartKey.Compare(subsumedDescs[j].StartKey) < 0
})
Comment on lines +587 to +590 (Collaborator):

Looks unnecessary. This slice is already sorted, by construction here. If you like, assert slices.IsSorted here, or in prepareSnapApply?

The code in clearSubsumedReplicaDiskData might have been misleading before, giving the impression that this slice can be unsorted.
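
A minimal sketch of the suggested assertion, assuming the standard library slices package (Go 1.21+); the exact placement and failure message are illustrative, not part of the PR:

	// Assert, rather than re-establish, the sort order that already holds by
	// construction of subsumedDescs above.
	if !slices.IsSortedFunc(subsumedDescs, func(a, b *roachpb.RangeDescriptor) int {
		return a.StartKey.Compare(b.StartKey)
	}) {
		log.Fatalf(ctx, "subsumedDescs not sorted by start key")
	}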

st := r.ClusterSettings()
prepInput := prepareSnapApplyInput{
id: r.ID(),
@@ -595,16 +600,20 @@
truncState: truncState,
hardState: hs,
desc: desc,
origDesc: r.descRLocked(),
Comment (Collaborator):

Tripped the mutex assertion here, since mu is not locked.

Consider r.Desc(). I can see at least one call to it above, so maybe reuse it to avoid extra mutex locks?

Also, how does this work when origDesc is empty? I.e. when this snapshot just initializes our replica.

Comment (Collaborator):

We probably don't need to lock in the first place. Desc is in "shMu", i.e. always changed with mu and raftMu both locked, and readable when holding either of the mutexes.
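
A sketch of the suggested alternative, reading the descriptor without holding mu; whether to call r.Desc() or reuse the descriptor read earlier in applySnapshotRaftMuLocked is left open in this thread:

	// r.Desc() acquires mu internally, so the mutex assertion does not trip.
	// Under the shMu convention noted above, the descriptor is also readable
	// while raftMu alone is held.
	prepInput := prepareSnapApplyInput{
		// ... other fields as in the diff ...
		origDesc: r.Desc(), // instead of r.descRLocked()
	}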

subsumedDescs: subsumedDescs,
}

_ = applySnapshotTODO
clearedUnreplicatedSpan, clearedSubsumedSpans, err := prepareSnapApply(ctx, prepInput)
clearedUnreplicatedSpan, clearedSubsumedSpans, clearedSplitSpans, err := prepareSnapApply(
ctx, prepInput,
)
if err != nil {
return err
}
clearedSpans = append(clearedSpans, clearedUnreplicatedSpan)
clearedSpans = append(clearedSpans, clearedSubsumedSpans...)
clearedSpans = append(clearedSpans, clearedSplitSpans...)

ls := r.asLogStorage()

243 changes: 150 additions & 93 deletions pkg/kv/kvserver/snapshot_apply_prepare.go
@@ -30,42 +30,97 @@ type prepareSnapApplyInput struct {
sl stateloader.StateLoader
writeSST func(context.Context, []byte) error

truncState kvserverpb.RaftTruncatedState
hardState raftpb.HardState
desc *roachpb.RangeDescriptor
truncState kvserverpb.RaftTruncatedState
hardState raftpb.HardState
desc *roachpb.RangeDescriptor // corresponds to the range descriptor in the snapshot
origDesc *roachpb.RangeDescriptor // pre-snapshot range descriptor
// NB: subsumedDescs, if set, must be in sorted (by start key) order.
subsumedDescs []*roachpb.RangeDescriptor
}

// prepareSnapApply writes the unreplicated SST for the snapshot and clears disk data for subsumed replicas.
// prepareSnapApply writes the unreplicated SST for the snapshot and clears disk
// data for subsumed replicas.
func prepareSnapApply(
ctx context.Context, input prepareSnapApplyInput,
) (
roachpb.Span, // clearedUnreplicatedSpan
[]roachpb.Span, // clearedSubsumedSpans
error,
clearedUnreplicatedSpan roachpb.Span,
clearedSubsumedSpans []roachpb.Span,
clearedSplitSpans []roachpb.Span,
_ error,
) {
_ = applySnapshotTODO // 3.1 + 1.1 + 2.5.
unreplicatedSSTFile, clearedUnreplicatedSpan, err := writeUnreplicatedSST(
ctx, input.id, input.st, input.truncState, input.hardState, input.sl,
)
if err != nil {
return roachpb.Span{}, nil, err
return roachpb.Span{}, nil, nil, err
}
_ = applySnapshotTODO // add to 2.4.
if err := input.writeSST(ctx, unreplicatedSSTFile.Data()); err != nil {
return roachpb.Span{}, nil, err
return roachpb.Span{}, nil, nil, err
}

_ = applySnapshotTODO // 3.2 + 2.1 + 2.2 + 2.3
clearedSubsumedSpans, err := clearSubsumedReplicaDiskData(
clearedSubsumedSpans, err = clearSubsumedReplicaDiskData(
ctx, input.st, input.todoEng, input.writeSST,
input.desc, input.subsumedDescs,
)
if err != nil {
return roachpb.Span{}, nil, err
return roachpb.Span{}, nil, nil, err
}

return clearedUnreplicatedSpan, clearedSubsumedSpans, nil
rightMostDesc := input.origDesc
if len(input.subsumedDescs) != 0 {
// NB: We might have to create SSTs for the range local keys, lock table
// keys, and user keys depending on whether the subsumed replicas are fully
// contained by the replica in our snapshot. The following is an example of
// this case arising.
//
// a b c d
// |---1---|-------2-------| S1
// |---1-------------------| S2
// |---1-----------|---3---| S3
//
// Since the merge is the first operation to happen, a follower could be
// down before it completes. The range could then split, and it is
// reasonable for S1 to learn about both these operations via a snapshot for
// r1 from S3. In the general case, this can lead to the following
// situation[*] where subsumed replicas may extend past the snapshot:
//
// [a----------------s1---...---sn)
// [a---------------------b)
//
// So, we need to additionally clear [b,sn). This is handled by the call to
// clearSplitReplicaDiskData below by correctly passing in the
// rightMostDesc. Also see the commentary on that function.
//
// [*] s1, ..., sn are the end keys for the subsumed replicas (for the
// current keySpan).
rightMostDesc = input.subsumedDescs[len(input.subsumedDescs)-1]

// In the other case, where the subsumed replicas are fully contained in the
Comment (Collaborator):

Likely makes sense to move the shorter/longer logic up the stack from clearSplitReplicaDiskData, so that these comments are interleaved with the checks as they happen. I made some suggestions in clearSplitReplicaDiskData on how to achieve it. (A sketch of this hoisting follows the function below.)

// snapshot:
//
// [a---s1---...---sn)
// [a---------------------b)
//
// We don't need to clear additional keyspace here, since clearing [a,b)
// will also clear the keyspace owned by all the subsumed replicas. Any
// other per-replica key spans were cleared by the call to
// clearSubsumedReplicaDiskData above.
// TODO(arul): ^ during review, looking for suggestions for more precise
// wording for this.
}

clearedSplitSpans, err = clearSplitReplicaDiskData(
ctx, input.st, input.todoEng, input.writeSST,
input.desc, rightMostDesc,
)
if err != nil {
return roachpb.Span{}, nil, nil, err
}

return clearedUnreplicatedSpan, clearedSubsumedSpans, clearedSplitSpans, nil
}
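
A sketch of the hoisting suggested in the review comment above, keeping the shorter/longer check next to its commentary in the caller; variable names follow the diff, and this is illustrative rather than the PR's code:

	// Only clear [b, sn) when the right-most overlapping descriptor extends
	// past the snapshot descriptor; otherwise there is no split to handle.
	if rightMostDesc.EndKey.Compare(input.desc.EndKey) > 0 {
		clearedSplitSpans, err = clearSplitReplicaDiskData(
			ctx, input.st, input.todoEng, input.writeSST,
			input.desc, rightMostDesc,
		)
		if err != nil {
			return roachpb.Span{}, nil, nil, err
		}
	}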

// writeUnreplicatedSST creates an SST for snapshot application that
@@ -150,7 +205,8 @@ func rewriteRaftState(
// the Reader reflects the latest I/O each of the subsumed replicas has done
// (i.e. Reader was instantiated after all raftMu were acquired).
//
// NB: does nothing if subsumedDescs is empty.
// NB: does nothing if subsumedDescs is empty. If non-empty, the caller must
// ensure that subsumedDescs is sorted in ascending order by start key.
func clearSubsumedReplicaDiskData(
ctx context.Context,
st *cluster.Settings,
@@ -159,20 +215,23 @@
desc *roachpb.RangeDescriptor,
subsumedDescs []*roachpb.RangeDescriptor,
) (clearedSpans []roachpb.Span, _ error) {
if len(subsumedDescs) == 0 {
return // no subsumed replicas to speak of; early return
}
// NB: The snapshot must never subsume a replica that extends the range of the
// replica to the left. This is because splits and merges (the only
// operation that change the key bounds) always leave the start key intact.
// Extending to the left implies that either we merged "to the left" (we
// don't), or that we're applying a snapshot for another range (we don't do
// that either). Something is severely wrong for this to happen, so perform
// a sanity check.
if subsumedDescs[0].StartKey.Compare(desc.StartKey) < 0 { // subsumedDescs are sorted by StartKey
log.Fatalf(ctx, "subsuming replica to our left; key span: %v; total key span %v",
Comment (Collaborator):

The log line no longer prints key spans. Update the wording?

subsumedDescs[0].StartKey, desc.StartKey)
}
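
A possible rewording of the fatal message per the comment above, since the arguments are now start keys rather than key spans (wording illustrative):

	log.Fatalf(ctx, "subsuming replica to our left; subsumed start key: %v; snapshot start key: %v",
		subsumedDescs[0].StartKey, desc.StartKey)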

// NB: we don't clear RangeID local key spans here. That happens
// via the call to DestroyReplica.
getKeySpans := func(d *roachpb.RangeDescriptor) []roachpb.Span {
return rditer.Select(d.RangeID, rditer.SelectOpts{
Ranged: rditer.SelectRangedOptions{
RSpan: d.RSpan(),
SystemKeys: true,
UserKeys: true,
LockTable: true,
},
})
}
keySpans := getKeySpans(desc)
totalKeySpans := append([]roachpb.Span(nil), keySpans...)
for _, subDesc := range subsumedDescs {
// We have to create an SST for the subsumed replica's range-id local keys.
subsumedReplSSTFile := &storage.MemObject{}
@@ -206,92 +265,90 @@
return nil, err
}
}

srKeySpans := getKeySpans(subDesc)
// Compute the total key space covered by the current replica and all
// subsumed replicas.
for i := range srKeySpans {
if srKeySpans[i].Key.Compare(totalKeySpans[i].Key) < 0 {
totalKeySpans[i].Key = srKeySpans[i].Key
}
if srKeySpans[i].EndKey.Compare(totalKeySpans[i].EndKey) > 0 {
totalKeySpans[i].EndKey = srKeySpans[i].EndKey
}
}
}

// We might have to create SSTs for the range local keys, lock table keys,
// and user keys depending on if the subsumed replicas are not fully
// contained by the replica in our snapshot. The following is an example to
// this case happening.
//
// a b c d
// |---1---|-------2-------| S1
// |---1-------------------| S2
// |---1-----------|---3---| S3
//
// Since the merge is the first operation to happen, a follower could be down
// before it completes. It is reasonable for a snapshot for r1 from S3 to
// subsume both r1 and r2 in S1.
for i := range keySpans {
// The snapshot must never subsume a replica that extends the range of the
// replica to the left. This is because splits and merges (the only
// operation that change the key bounds) always leave the start key intact.
// Extending to the left implies that either we merged "to the left" (we
// don't), or that we're applying a snapshot for another range (we don't do
// that either). Something is severely wrong for this to happen.
if totalKeySpans[i].Key.Compare(keySpans[i].Key) < 0 {
log.Fatalf(ctx, "subsuming replica to our left; key span: %v; total key span %v",
keySpans[i], totalKeySpans[i])
}

// In the comments below, s1, ..., sn are the end keys for the subsumed
// replicas (for the current keySpan).
// Note that if there aren't any subsumed replicas (the common case), the
// next comparison is always zero and this loop is a no-op.
return clearedSpans, nil
}

if totalKeySpans[i].EndKey.Compare(keySpans[i].EndKey) <= 0 {
// The subsumed replicas are fully contained in the snapshot:
//
// [a---s1---...---sn)
// [a---------------------b)
//
// We don't need to clear additional keyspace here, since clearing `[a,b)`
// will also clear all subsumed replicas.
continue
}
// clearSplitReplicaDiskData clears the on-disk data of any replica that has
Comment (Collaborator):

Suggested below how to make this func a lower level / straightforward primitive. Correspondingly, these comments might need to move one level above to the caller.

// been split, and we've learned of this split when applying the snapshot
// because the snapshot narrows the range. We clear on-disk data by creating
// SSTs with range deletion tombstones.
//
// The supplied rightMostDesc corresponds to the right-most descriptor on the
// store that overlaps with the descriptor in the snapshot. In the simplest
// case, where a replica (LHS) learns about the split through the snapshot, this
// is the descriptor of the replica itself:
//
// original descriptor: [a-----------------------------c)
// snapshot descriptor: [a---------------------b)
//
// The more involved case is when one or more replicas have been merged into the
// LHS before the split, and the LHS is learning about all of these through the
// snapshot -- in this case, the rightMostDesc corresponds to the right-most
// subsumed replica:
//
// store descriptors: [a----------------s1---...---sn)
// snapshot descriptor: [a---------------------b)
//
// In the diagram above, S1...Sn correspond to subsumed replicas with end keys
// s1...sn respectively. These are all replicas on the store that overlap with
// the snapshot descriptor, covering the range [a,b), and the rightMostDesc is
// the replica Sn.
//
// clearSplitReplicaDiskData is a no-op if there is no split, i.e., if
// rightMostDesc.EndKey does not extend past the end of the snapshot's
// descriptor.
func clearSplitReplicaDiskData(
ctx context.Context,
st *cluster.Settings,
reader storage.Reader,
writeSST func(context.Context, []byte) error,
snapDesc *roachpb.RangeDescriptor,
rightMostDesc *roachpb.RangeDescriptor,
Comment on lines +307 to +308 (@pav-kv, Collaborator, Jun 25, 2025):

Out of these 2 descriptors, the only information we need/use is the two EndKeys, which form the key span that we want to clear. So we can instead just take this span, and make this func straightforward:

for _, span := range rditer.Select(... {
	RSpan: <this "delta" RSpan>
}) {
	_ = span // generate SST
}

I started doing this in https://github.com/cockroachdb/cockroach/pull/147378/files#diff-dd96d448893ec224dc7a1ca0e9056a6ec9d8aa830b80e1b6a44e2e3a8eab29abR262-R266, but it fell through the cracks.

) (clearedSpans []roachpb.Span, _ error) {
if rightMostDesc.EndKey.Compare(snapDesc.EndKey) <= 0 {
return // no-op, no split
}

// The subsumed replicas extend past the snapshot:
//
// [a----------------s1---...---sn)
// [a---------------------b)
//
// We need to additionally clear [b,sn).
getKeySpans := func(d *roachpb.RangeDescriptor) []roachpb.Span {
return rditer.Select(d.RangeID, rditer.SelectOpts{
Ranged: rditer.SelectRangedOptions{
RSpan: d.RSpan(),
SystemKeys: true,
UserKeys: true,
LockTable: true,
},
})
}
keySpans := getKeySpans(snapDesc)
origKeySpans := getKeySpans(rightMostDesc)

subsumedReplSSTFile := &storage.MemObject{}
subsumedReplSST := storage.MakeIngestionSSTWriter(
ctx, st, subsumedReplSSTFile,
for i := range origKeySpans {
rhsReplSSTFile := &storage.MemObject{}
rhsReplSST := storage.MakeIngestionSSTWriter(
ctx, st, rhsReplSSTFile,
)
if err := storage.ClearRangeWithHeuristic(
ctx,
reader,
&subsumedReplSST,
&rhsReplSST,
keySpans[i].EndKey,
totalKeySpans[i].EndKey,
origKeySpans[i].EndKey,
kvstorage.ClearRangeThresholdPointKeys,
); err != nil {
subsumedReplSST.Close()
rhsReplSST.Close()
return nil, err
}
clearedSpans = append(clearedSpans,
roachpb.Span{Key: keySpans[i].EndKey, EndKey: totalKeySpans[i].EndKey})
if err := subsumedReplSST.Finish(); err != nil {
roachpb.Span{Key: keySpans[i].EndKey, EndKey: origKeySpans[i].EndKey})
if err := rhsReplSST.Finish(); err != nil {
return nil, err
}
if subsumedReplSST.DataSize > 0 {
// TODO(itsbilal): Write to SST directly in subsumedReplSST rather than
if rhsReplSST.DataSize > 0 {
// TODO(arul): write to SST directly in rhsReplSST rather than
// buffering in a MemObject first.
if err := writeSST(ctx, subsumedReplSSTFile.Data()); err != nil {
if err := writeSST(ctx, rhsReplSSTFile.Data()); err != nil {
return nil, err
}
}
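
A sketch of the primitive @pav-kv suggests above: accept just the "delta" span [snapDesc.EndKey, rightMostDesc.EndKey) and clear the selected key spans under it. The function name and signature are hypothetical; the rditer and storage calls mirror the ones already used in this diff:

	// clearSpanDiskData clears all ranged key spans (range-local system keys,
	// lock table keys, user keys) that fall inside the given delta span,
	// writing range deletion tombstones via SSTs. Hypothetical helper,
	// sketched from the review suggestion.
	func clearSpanDiskData(
		ctx context.Context,
		st *cluster.Settings,
		reader storage.Reader,
		writeSST func(context.Context, []byte) error,
		rangeID roachpb.RangeID,
		delta roachpb.RSpan, // e.g. {Key: snapDesc.EndKey, EndKey: rightMostDesc.EndKey}
	) (clearedSpans []roachpb.Span, _ error) {
		for _, span := range rditer.Select(rangeID, rditer.SelectOpts{
			Ranged: rditer.SelectRangedOptions{
				RSpan:      delta,
				SystemKeys: true,
				UserKeys:   true,
				LockTable:  true,
			},
		}) {
			sstFile := &storage.MemObject{}
			sst := storage.MakeIngestionSSTWriter(ctx, st, sstFile)
			if err := storage.ClearRangeWithHeuristic(
				ctx, reader, &sst, span.Key, span.EndKey,
				kvstorage.ClearRangeThresholdPointKeys,
			); err != nil {
				sst.Close()
				return nil, err
			}
			clearedSpans = append(clearedSpans, span)
			if err := sst.Finish(); err != nil {
				return nil, err
			}
			if sst.DataSize > 0 {
				if err := writeSST(ctx, sstFile.Data()); err != nil {
					return nil, err
				}
			}
		}
		return clearedSpans, nil
	}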