-
Notifications
You must be signed in to change notification settings - Fork 3.9k
kvserver: clear RHS when learning about a split through a snapshot #148781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ package kvserver | |
|
||
import ( | ||
"context" | ||
"sort" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/keys" | ||
|
@@ -583,6 +584,10 @@ func (r *Replica) applySnapshotRaftMuLocked( | |
subsumedDescs = append(subsumedDescs, sr.Desc()) | ||
} | ||
|
||
// NB: subsumedDescs in prepareSnapApplyInput must be sorted by start key. | ||
sort.Slice(subsumedDescs, func(i, j int) bool { | ||
return subsumedDescs[i].StartKey.Compare(subsumedDescs[j].StartKey) < 0 | ||
}) | ||
st := r.ClusterSettings() | ||
prepInput := prepareSnapApplyInput{ | ||
id: r.ID(), | ||
|
@@ -595,16 +600,20 @@ func (r *Replica) applySnapshotRaftMuLocked( | |
truncState: truncState, | ||
hardState: hs, | ||
desc: desc, | ||
origDesc: r.descRLocked(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tripped the mutex assertion here, since Consider Also, how does this work when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We probably don't need to lock in the first place. |
||
subsumedDescs: subsumedDescs, | ||
} | ||
|
||
_ = applySnapshotTODO | ||
clearedUnreplicatedSpan, clearedSubsumedSpans, err := prepareSnapApply(ctx, prepInput) | ||
clearedUnreplicatedSpan, clearedSubsumedSpans, clearedSplitSpans, err := prepareSnapApply( | ||
ctx, prepInput, | ||
) | ||
if err != nil { | ||
return err | ||
} | ||
clearedSpans = append(clearedSpans, clearedUnreplicatedSpan) | ||
clearedSpans = append(clearedSpans, clearedSubsumedSpans...) | ||
clearedSpans = append(clearedSpans, clearedSplitSpans...) | ||
|
||
ls := r.asLogStorage() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,42 +30,97 @@ type prepareSnapApplyInput struct { | |
sl stateloader.StateLoader | ||
writeSST func(context.Context, []byte) error | ||
|
||
truncState kvserverpb.RaftTruncatedState | ||
hardState raftpb.HardState | ||
desc *roachpb.RangeDescriptor | ||
truncState kvserverpb.RaftTruncatedState | ||
hardState raftpb.HardState | ||
desc *roachpb.RangeDescriptor // corresponds to the range descriptor in the snapshot | ||
origDesc *roachpb.RangeDescriptor // pre-snapshot range descriptor | ||
// NB: subsumedDescs, if set, must be in sorted (by start key) order. | ||
subsumedDescs []*roachpb.RangeDescriptor | ||
} | ||
|
||
// prepareSnapApply writes the unreplicated SST for the snapshot and clears disk data for subsumed replicas. | ||
// prepareSnapApply writes the unreplicated SST for the snapshot and clears disk | ||
// data for subsumed replicas. | ||
func prepareSnapApply( | ||
ctx context.Context, input prepareSnapApplyInput, | ||
) ( | ||
roachpb.Span, // clearedUnreplicatedSpan | ||
[]roachpb.Span, // clearedSubsumedSpans | ||
error, | ||
clearedUnreplicatedSpan roachpb.Span, | ||
clearedSubsumedSpans []roachpb.Span, | ||
clearedSplitSpans []roachpb.Span, | ||
_ error, | ||
) { | ||
_ = applySnapshotTODO // 3.1 + 1.1 + 2.5. | ||
unreplicatedSSTFile, clearedUnreplicatedSpan, err := writeUnreplicatedSST( | ||
ctx, input.id, input.st, input.truncState, input.hardState, input.sl, | ||
) | ||
if err != nil { | ||
return roachpb.Span{}, nil, err | ||
return roachpb.Span{}, nil, nil, err | ||
} | ||
_ = applySnapshotTODO // add to 2.4. | ||
if err := input.writeSST(ctx, unreplicatedSSTFile.Data()); err != nil { | ||
return roachpb.Span{}, nil, err | ||
return roachpb.Span{}, nil, nil, err | ||
} | ||
|
||
_ = applySnapshotTODO // 3.2 + 2.1 + 2.2 + 2.3 | ||
clearedSubsumedSpans, err := clearSubsumedReplicaDiskData( | ||
clearedSubsumedSpans, err = clearSubsumedReplicaDiskData( | ||
ctx, input.st, input.todoEng, input.writeSST, | ||
input.desc, input.subsumedDescs, | ||
) | ||
if err != nil { | ||
return roachpb.Span{}, nil, err | ||
return roachpb.Span{}, nil, nil, err | ||
} | ||
|
||
return clearedUnreplicatedSpan, clearedSubsumedSpans, nil | ||
rightMostDesc := input.origDesc | ||
if len(input.subsumedDescs) != 0 { | ||
// NB: We might have to create SSTs for the range local keys, lock table | ||
// keys, and user keys depending on if the subsumed replicas are not fully | ||
// contained by the replica in our snapshot. The following is an example of | ||
// this case happening. | ||
// | ||
// a b c d | ||
// |---1---|-------2-------| S1 | ||
// |---1-------------------| S2 | ||
// |---1-----------|---3---| S3 | ||
// | ||
// Since the merge is the first operation to happen, a follower could be | ||
// down before it completes. The range could then split, and it is | ||
// reasonable for S1 to learn about both these operations via a snapshot for | ||
// r1 from S3. In the general case, this can lead to the following | ||
// situation[*] where subsumed replicas may extend past the snapshot: | ||
// | ||
// [a----------------s1---...---sn) | ||
// [a---------------------b) | ||
// | ||
// So, we need to additionally clear [b,sn). This is handled by the call to | ||
// clearSplitReplicaDiskData below by correctly passing in the | ||
// rightMostDesc. Also see the commentary on that function. | ||
// | ||
// [*] s1, ..., sn are the end keys for the subsumed replicas (for the | ||
// current keySpan). | ||
rightMostDesc = input.subsumedDescs[len(input.subsumedDescs)-1] | ||
|
||
// In the other case, where the subsumed replicas are fully contained in the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Likely makes sense to move the shorter/longer logic up the stack from |
||
// snapshot: | ||
// | ||
// [a---s1---...---sn) | ||
// [a---------------------b) | ||
// | ||
// We don't need to clear additional keyspace here, since clearing [a,b) | ||
// will also clear the keyspace owned by all the subsumed replicas. Any | ||
// other per-replica key spans were cleared by the call to | ||
// clearSubsumedReplicaDiskData above. | ||
// TODO(arul): ^ during review, looking for suggestions for more precise | ||
// wording for this. | ||
} | ||
|
||
clearedSplitSpans, err = clearSplitReplicaDiskData( | ||
ctx, input.st, input.todoEng, input.writeSST, | ||
input.desc, rightMostDesc, | ||
) | ||
if err != nil { | ||
return roachpb.Span{}, nil, nil, err | ||
} | ||
|
||
return clearedUnreplicatedSpan, clearedSubsumedSpans, clearedSplitSpans, nil | ||
} | ||
|
||
// writeUnreplicatedSST creates an SST for snapshot application that | ||
|
@@ -150,7 +205,8 @@ func rewriteRaftState( | |
// the Reader reflects the latest I/O each of the subsumed replicas has done | ||
// (i.e. Reader was instantiated after all raftMu were acquired). | ||
// | ||
// NB: does nothing if subsumedDescs is empty. | ||
// NB: does nothing if subsumedDescs is empty. If non-empty, the caller must | ||
// ensure that subsumedDescs is sorted in ascending order by start key. | ||
func clearSubsumedReplicaDiskData( | ||
ctx context.Context, | ||
st *cluster.Settings, | ||
|
@@ -159,20 +215,23 @@ func clearSubsumedReplicaDiskData( | |
desc *roachpb.RangeDescriptor, | ||
subsumedDescs []*roachpb.RangeDescriptor, | ||
) (clearedSpans []roachpb.Span, _ error) { | ||
if len(subsumedDescs) == 0 { | ||
return // no subsumed replicas to speak of; early return | ||
} | ||
// NB: The snapshot must never subsume a replica that extends the range of the | ||
// replica to the left. This is because splits and merges (the only | ||
// operations that change the key bounds) always leave the start key intact. | ||
// Extending to the left implies that either we merged "to the left" (we | ||
// don't), or that we're applying a snapshot for another range (we don't do | ||
// that either). Something is severely wrong for this to happen, so perform | ||
// a sanity check. | ||
if subsumedDescs[0].StartKey.Compare(desc.StartKey) < 0 { // subsumedDescs are sorted by StartKey | ||
log.Fatalf(ctx, "subsuming replica to our left; key span: %v; total key span %v", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The log line no longer prints key spans. Update the wording? |
||
subsumedDescs[0].StartKey, desc.StartKey) | ||
} | ||
|
||
// NB: we don't clear RangeID local key spans here. That happens | ||
// via the call to DestroyReplica. | ||
getKeySpans := func(d *roachpb.RangeDescriptor) []roachpb.Span { | ||
return rditer.Select(d.RangeID, rditer.SelectOpts{ | ||
Ranged: rditer.SelectRangedOptions{ | ||
RSpan: d.RSpan(), | ||
SystemKeys: true, | ||
UserKeys: true, | ||
LockTable: true, | ||
}, | ||
}) | ||
} | ||
keySpans := getKeySpans(desc) | ||
totalKeySpans := append([]roachpb.Span(nil), keySpans...) | ||
for _, subDesc := range subsumedDescs { | ||
// We have to create an SST for the subsumed replica's range-id local keys. | ||
subsumedReplSSTFile := &storage.MemObject{} | ||
|
@@ -206,92 +265,90 @@ func clearSubsumedReplicaDiskData( | |
return nil, err | ||
} | ||
} | ||
|
||
srKeySpans := getKeySpans(subDesc) | ||
// Compute the total key space covered by the current replica and all | ||
// subsumed replicas. | ||
for i := range srKeySpans { | ||
if srKeySpans[i].Key.Compare(totalKeySpans[i].Key) < 0 { | ||
totalKeySpans[i].Key = srKeySpans[i].Key | ||
} | ||
if srKeySpans[i].EndKey.Compare(totalKeySpans[i].EndKey) > 0 { | ||
totalKeySpans[i].EndKey = srKeySpans[i].EndKey | ||
} | ||
} | ||
} | ||
|
||
// We might have to create SSTs for the range local keys, lock table keys, | ||
// and user keys depending on if the subsumed replicas are not fully | ||
// contained by the replica in our snapshot. The following is an example of | ||
// this case happening. | ||
// | ||
// a b c d | ||
// |---1---|-------2-------| S1 | ||
// |---1-------------------| S2 | ||
// |---1-----------|---3---| S3 | ||
// | ||
// Since the merge is the first operation to happen, a follower could be down | ||
// before it completes. It is reasonable for a snapshot for r1 from S3 to | ||
// subsume both r1 and r2 in S1. | ||
for i := range keySpans { | ||
// The snapshot must never subsume a replica that extends the range of the | ||
// replica to the left. This is because splits and merges (the only | ||
// operations that change the key bounds) always leave the start key intact. | ||
// Extending to the left implies that either we merged "to the left" (we | ||
// don't), or that we're applying a snapshot for another range (we don't do | ||
// that either). Something is severely wrong for this to happen. | ||
if totalKeySpans[i].Key.Compare(keySpans[i].Key) < 0 { | ||
log.Fatalf(ctx, "subsuming replica to our left; key span: %v; total key span %v", | ||
keySpans[i], totalKeySpans[i]) | ||
} | ||
|
||
// In the comments below, s1, ..., sn are the end keys for the subsumed | ||
// replicas (for the current keySpan). | ||
// Note that if there aren't any subsumed replicas (the common case), the | ||
// next comparison is always zero and this loop is a no-op. | ||
return clearedSpans, nil | ||
} | ||
|
||
if totalKeySpans[i].EndKey.Compare(keySpans[i].EndKey) <= 0 { | ||
// The subsumed replicas are fully contained in the snapshot: | ||
// | ||
// [a---s1---...---sn) | ||
// [a---------------------b) | ||
// | ||
// We don't need to clear additional keyspace here, since clearing `[a,b)` | ||
// will also clear all subsumed replicas. | ||
continue | ||
} | ||
// clearSplitReplicaDiskData clears the on disk data of any replica that has | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggested below how to make this func a lower level / straightforward primitive. Correspondingly, these comments might need to move one level above to the caller. |
||
// been split, and we've learned of this split when applying the snapshot | ||
// because the snapshot narrows the range. We clear on-disk data by creating | ||
// SSTs with range deletion tombstones. | ||
// | ||
// The supplied rightMostDesc corresponds to the right-most descriptor on the | ||
// store that overlaps with the descriptor in the snapshot. In the simplest | ||
// case, where a replica (LHS) learns about the split through the snapshot, this | ||
// is the descriptor of the replica itself: | ||
// | ||
// original descriptor: [a-----------------------------c) | ||
// snapshot descriptor: [a---------------------b) | ||
// | ||
// The more involved case is when one or more replicas have been merged into the | ||
// LHS before the split, and the LHS is learning about all of these through the | ||
// snapshot -- in this case, the rightMostDesc corresponds to the right-most | ||
// subsumed replica: | ||
// | ||
// store descriptors: [a----------------s1---...---sn) | ||
// snapshot descriptor: [a---------------------b) | ||
// | ||
// In the diagram above, S1...Sn correspond to subsumed replicas with end keys | ||
// s1...sn respectively. These are all replicas on the store that overlap with | ||
// the snapshot descriptor, covering the range [a,b), and the rightMostDesc is | ||
// the replica Sn. | ||
// | ||
// clearSplitReplicaDiskData is a no-op if there is no split, i.e., the | ||
// rightMostDesc.EndKey indicates that it's narrower than the snapshot's | ||
// descriptor. | ||
func clearSplitReplicaDiskData( | ||
ctx context.Context, | ||
st *cluster.Settings, | ||
reader storage.Reader, | ||
writeSST func(context.Context, []byte) error, | ||
snapDesc *roachpb.RangeDescriptor, | ||
rightMostDesc *roachpb.RangeDescriptor, | ||
Comment on lines
+307
to
+308
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Out of these 2 descriptors, the only information we need/use are the two for _, span := range rditer.Select(... {
RSpan: <this "delta" RSpan>
}) {
_ = span // generate SST
} I started doing this in https://github.com/cockroachdb/cockroach/pull/147378/files#diff-dd96d448893ec224dc7a1ca0e9056a6ec9d8aa830b80e1b6a44e2e3a8eab29abR262-R266, but it fell through the cracks. |
||
) (clearedSpans []roachpb.Span, _ error) { | ||
if rightMostDesc.EndKey.Compare(snapDesc.EndKey) <= 0 { | ||
return // no-op, no split | ||
} | ||
|
||
// The subsumed replicas extend past the snapshot: | ||
// | ||
// [a----------------s1---...---sn) | ||
// [a---------------------b) | ||
// | ||
// We need to additionally clear [b,sn). | ||
getKeySpans := func(d *roachpb.RangeDescriptor) []roachpb.Span { | ||
return rditer.Select(d.RangeID, rditer.SelectOpts{ | ||
Ranged: rditer.SelectRangedOptions{ | ||
RSpan: d.RSpan(), | ||
SystemKeys: true, | ||
UserKeys: true, | ||
LockTable: true, | ||
}, | ||
}) | ||
} | ||
keySpans := getKeySpans(snapDesc) | ||
origKeySpans := getKeySpans(rightMostDesc) | ||
|
||
subsumedReplSSTFile := &storage.MemObject{} | ||
subsumedReplSST := storage.MakeIngestionSSTWriter( | ||
ctx, st, subsumedReplSSTFile, | ||
for i := range origKeySpans { | ||
rhsReplSSTFile := &storage.MemObject{} | ||
rhsReplSST := storage.MakeIngestionSSTWriter( | ||
ctx, st, rhsReplSSTFile, | ||
) | ||
if err := storage.ClearRangeWithHeuristic( | ||
ctx, | ||
reader, | ||
&subsumedReplSST, | ||
&rhsReplSST, | ||
keySpans[i].EndKey, | ||
totalKeySpans[i].EndKey, | ||
origKeySpans[i].EndKey, | ||
kvstorage.ClearRangeThresholdPointKeys, | ||
); err != nil { | ||
subsumedReplSST.Close() | ||
rhsReplSST.Close() | ||
return nil, err | ||
} | ||
clearedSpans = append(clearedSpans, | ||
roachpb.Span{Key: keySpans[i].EndKey, EndKey: totalKeySpans[i].EndKey}) | ||
if err := subsumedReplSST.Finish(); err != nil { | ||
roachpb.Span{Key: keySpans[i].EndKey, EndKey: origKeySpans[i].EndKey}) | ||
if err := rhsReplSST.Finish(); err != nil { | ||
return nil, err | ||
} | ||
if subsumedReplSST.DataSize > 0 { | ||
// TODO(itsbilal): Write to SST directly in subsumedReplSST rather than | ||
if rhsReplSST.DataSize > 0 { | ||
// TODO(arul): write to SST directly in rhsReplSST rather than | ||
// buffering in a MemObject first. | ||
if err := writeSST(ctx, subsumedReplSSTFile.Data()); err != nil { | ||
if err := writeSST(ctx, rhsReplSSTFile.Data()); err != nil { | ||
return nil, err | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks unnecessary. This slice is already sorted, by construction here. If you like, assert
slices.IsSorted
here, or inprepareSnapApply
?The code in
clearSubsumedReplicaDiskData
might have been misleading before, making an impression that this slice can be unsorted.