Skip to content

Commit 79e136a

Browse files
craig[bot]pav-kv
andcommitted
Merge #148752
148752: loqrecovery: fixup and comment on sep-raft-log r=arulajmani a=pav-kv Epic: CRDB-49111 Co-authored-by: Pavel Kalinnikov <[email protected]>
2 parents aa1b367 + 97f32df commit 79e136a

File tree

3 files changed

+24
-9
lines changed

3 files changed

+24
-9
lines changed

pkg/kv/kvserver/loqrecovery/apply.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,8 @@ func applyReplicaUpdate(
317317

318318
hs.LeadEpoch = 0
319319

320+
// TODO(sep-raft-log): when raft and state machine engines are separated, this
321+
// update must be written to the raft engine.
320322
if err := sl.SetHardState(ctx, readWriter, hs); err != nil {
321323
return PrepareReplicaReport{}, errors.Wrap(err, "setting HardState")
322324
}

pkg/kv/kvserver/loqrecovery/collect.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,10 @@ func CollectStoresReplicaInfo(
158158
return loqrecoverypb.ClusterReplicaInfo{}, CollectionStats{}, errors.New("can't collect info from stored that belong to different clusters")
159159
}
160160
nodes[ident.NodeID] = struct{}{}
161-
if err := visitStoreReplicas(ctx, reader, ident.StoreID, ident.NodeID, version,
161+
// TODO(sep-raft-log): use different readers when the raft and state machine
162+
// engines are separate. Since the engines are immutable in this path, there
163+
// is no question whether to and in which order to grab engine snapshots.
164+
if err := visitStoreReplicas(ctx, reader, reader, ident.StoreID, ident.NodeID,
162165
func(info loqrecoverypb.ReplicaInfo) error {
163166
replicas = append(replicas, info)
164167
return nil
@@ -178,19 +181,21 @@ func CollectStoresReplicaInfo(
178181

179182
func visitStoreReplicas(
180183
ctx context.Context,
181-
reader storage.Reader,
184+
state, raft storage.Reader,
182185
storeID roachpb.StoreID,
183186
nodeID roachpb.NodeID,
184-
targetVersion clusterversion.ClusterVersion,
185187
send func(info loqrecoverypb.ReplicaInfo) error,
186188
) error {
187-
if err := kvstorage.IterateRangeDescriptorsFromDisk(ctx, reader, func(desc roachpb.RangeDescriptor) error {
189+
if err := kvstorage.IterateRangeDescriptorsFromDisk(ctx, state, func(desc roachpb.RangeDescriptor) error {
188190
rsl := stateloader.Make(desc.RangeID)
189-
rstate, err := rsl.Load(ctx, reader, &desc)
191+
rstate, err := rsl.Load(ctx, state, &desc)
190192
if err != nil {
191193
return err
192194
}
193-
hstate, err := rsl.LoadHardState(ctx, reader)
195+
// TODO(pav-kv): the LoQ recovery flow uses only the applied index, and the
196+
// HardState.Commit loaded here is unused. Consider removing. Make sure this
197+
// doesn't break compatibility for ReplicaInfo unmarshalling.
198+
hstate, err := rsl.LoadHardState(ctx, raft)
194199
if err != nil {
195200
return err
196201
}
@@ -199,8 +204,12 @@ func visitStoreReplicas(
199204
// at potentially uncommitted entries as we have no way to determine their
200205
// outcome, and they will become committed as soon as the replica is
201206
// designated as a survivor.
207+
// TODO(sep-raft-log): decide which LogID to read from. If the raft and
208+
// state machine readers are slightly out of sync, the LogIDs may mismatch.
209+
// For the heuristics here, it would probably make sense to read from all
210+
// LogIDs with unapplied entries.
202211
rangeUpdates, err := GetDescriptorChangesFromRaftLog(
203-
ctx, desc.RangeID, rstate.RaftAppliedIndex+1, math.MaxInt64, reader)
212+
ctx, desc.RangeID, rstate.RaftAppliedIndex+1, math.MaxInt64, raft)
204213
if err != nil {
205214
return err
206215
}

pkg/kv/kvserver/loqrecovery/server.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@ func (s Server) ServeLocalReplicas(
121121
_ *serverpb.RecoveryCollectLocalReplicaInfoRequest,
122122
stream serverpb.RPCAdmin_RecoveryCollectLocalReplicaInfoStream,
123123
) error {
124-
v := s.settings.Version.ActiveVersion(ctx)
125124
var stores []*kvserver.Store
126125
if err := s.stores.VisitStores(func(s *kvserver.Store) error {
127126
stores = append(stores, s)
@@ -135,9 +134,14 @@ func (s Server) ServeLocalReplicas(
135134
for _, s := range stores {
136135
s := s // copy for closure
137136
g.Go(func() error {
137+
// TODO(sep-raft-log): when raft and state machine engines are separate,
138+
// we need two snapshots here. This path is online, so we should make sure
139+
// these snapshots are consistent (in particular, the LogID must match
140+
// across the two), or make sure that LogID mismatch is handled in
141+
// visitStoreReplicas.
138142
reader := s.TODOEngine().NewSnapshot()
139143
defer reader.Close()
140-
return visitStoreReplicas(ctx, reader, s.StoreID(), s.NodeID(), v,
144+
return visitStoreReplicas(ctx, reader, reader, s.StoreID(), s.NodeID(),
141145
func(info loqrecoverypb.ReplicaInfo) error {
142146
return syncStream.Send(&serverpb.RecoveryCollectLocalReplicaInfoResponse{ReplicaInfo: &info})
143147
})

0 commit comments

Comments
 (0)