apache · ivandika3 · Feb 26, 2026 · Feb 26, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/ratis-docs/src/site/markdown/configurations.md b/ratis-docs/src/site/markdown/configurations.md
@@ -220,11 +220,38 @@ if it fails to receive any RPC responses from this peer within this specified ti
 
 ### Read Index - Configurations related to ReadIndex used in linearizable read 
 
-| **Property**    | `raft.server.read.read-index.applied-index.enabled`                   |
-|:----------------|:----------------------------------------------------------------------|
-| **Description** | whether applied index (instead of commit index) is used for ReadIndex |
-| **Type**        | boolean                                                               |
-| **Default**     | false                                                                 |
+| **Property**    | `raft.server.read.read-index.type`                                            |
+|:----------------|:------------------------------------------------------------------------------|
+| **Description** | type of read index returned                                                   |
+| **Type**        | enum `Read.ReadIndex.Type` [`COMMIT_INDEX`, `APPLIED_INDEX`, `REPLIED_INDEX`] |
+| **Default**     | `Read.ReadIndex.Type.COMMIT_INDEX`                                            |
+
+* `Read.ReadIndex.Type.COMMIT_INDEX` - Use leader's CommitIndex (see Raft Paper section 6.4)
+    * The safest type as it is specified in the Raft dissertation
+    * This ReadIndex type can be chosen if the baseline performance of linearizable reads from followers already meets expectations.
+
+* `Read.ReadIndex.Type.APPLIED_INDEX` - Use leader's AppliedIndex
+    * Allow leader to return AppliedIndex (instead of CommitIndex) as the ReadIndex
+    * This reduces the time a follower spends applying logs up to the ReadIndex, since AppliedIndex ≤ CommitIndex
+    * This ReadIndex type can be chosen if the `Read.ReadIndex.Type.COMMIT_INDEX` read latency is too high.
+
+* `Read.ReadIndex.Type.REPLIED_INDEX` - Use leader's RepliedIndex
+    * RepliedIndex is defined as the AppliedIndex of the last write request replied by the leader.
+    * The leader delays replying to write requests and only replies to them at every write batch boundary, configurable by `raft.server.read.read-index.replied-index.batch-interval`.
+    * This allows the ReadIndex to advance in coarser, less frequent steps, so followers are more likely to have already applied past the ReadIndex when a read arrives.
+    * This is most effective on read-heavy, follower-read workloads that prioritize overall read throughput without sacrificing consistency.
+    * There is a trade-off in increased write latency (up to one `raft.server.read.read-index.replied-index.batch-interval`) per write.
+    * RepliedIndex still guarantees linearizability (no stale read) since by definition each ReadIndex returns the index of the last replied request.
+    * If `raft.server.read.read-index.replied-index.batch-interval` is set to 0, the behavior is effectively the same as `Read.ReadIndex.Type.APPLIED_INDEX`
+
+Note that theoretically all the ReadIndex types still guarantee linearizability, 
+but there are tradeoffs (e.g. Write and Read performance) between different types.
+
+| **Property**    | `raft.server.read.read-index.replied-index.batch-interval`                                                                                   |
+|:----------------|:---------------------------------------------------------------------------------------------------------------------------------------------|
+| **Description** | if `Read.ReadIndex.Type` is `REPLIED_INDEX`, the interval at which held write replies are flushed to clients and `repliedIndex` is advanced  |
+| **Type**        | TimeDuration                                                                                                                                 |
+| **Default**     | 10ms                                                                                                                                         |
 
 | **Property**    | `raft.server.read.leader.heartbeat-check.enabled` |
 |:----------------|:--------------------------------------------------|

diff --git a/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java b/ratis-server-api/src/main/java/org/apache/ratis/server/RaftServerConfigKeys.java
@@ -280,15 +280,34 @@ static void setWriteIndexCacheExpiryTime(RaftProperties properties, TimeDuration
     interface ReadIndex {
       String PREFIX = Read.PREFIX + ".read-index";
 
-      String APPLIED_INDEX_ENABLED_KEY = PREFIX + ".applied-index.enabled";
-      boolean APPLIED_INDEX_ENABLED_DEFAULT = false;
-      static boolean appliedIndexEnabled(RaftProperties properties) {
-        return getBoolean(properties::getBoolean, APPLIED_INDEX_ENABLED_KEY,
-            APPLIED_INDEX_ENABLED_DEFAULT, getDefaultLog());
+      enum Type {
+        /** ReadIndex returns the leader's commitIndex (see Raft Paper section 6.4).  This is the default. */
+        COMMIT_INDEX,
+
+        /** ReadIndex returns the leader's appliedIndex (instead of commitIndex) to reduce the ReadIndex latency. */
+        APPLIED_INDEX,
+
+        /** ReadIndex returns the leader's repliedIndex, the index of the last write request replied to a client. */
+        REPLIED_INDEX
+      }
+
+      String TYPE_KEY = PREFIX + ".type";
+      Type TYPE_DEFAULT = Type.COMMIT_INDEX;
+      static Type type(RaftProperties properties) {
+        return get(properties::getEnum, TYPE_KEY, TYPE_DEFAULT, getDefaultLog());
+      }
+      static void setType(RaftProperties properties, Type type) {
+        set(properties::setEnum, TYPE_KEY, type);
       }
 
-      static void setAppliedIndexEnabled(RaftProperties properties, boolean enabled) {
-        setBoolean(properties::setBoolean, APPLIED_INDEX_ENABLED_KEY, enabled);
+      String REPLIED_INDEX_BATCH_INTERVAL_KEY = PREFIX + ".replied-index.batch-interval";
+      TimeDuration REPLIED_INDEX_BATCH_INTERVAL_DEFAULT = TimeDuration.valueOf(10, TimeUnit.MILLISECONDS);
+      static TimeDuration repliedIndexBatchInterval(RaftProperties properties) {
+        return getTimeDuration(properties.getTimeDuration(REPLIED_INDEX_BATCH_INTERVAL_DEFAULT.getUnit()),
+            REPLIED_INDEX_BATCH_INTERVAL_KEY, REPLIED_INDEX_BATCH_INTERVAL_DEFAULT, getDefaultLog());
+      }
+      static void setRepliedIndexBatchInterval(RaftProperties properties, TimeDuration interval) {
+        setTimeDuration(properties::setTimeDuration, REPLIED_INDEX_BATCH_INTERVAL_KEY, interval);
       }
     }
   }

diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java
@@ -39,6 +39,7 @@
 import org.apache.ratis.protocol.exceptions.ReadIndexException;
 import org.apache.ratis.protocol.exceptions.ReconfigurationTimeoutException;
 import org.apache.ratis.server.RaftServerConfigKeys;
+import org.apache.ratis.server.RaftServerConfigKeys.Read.ReadIndex.Type;
 import org.apache.ratis.server.impl.ReadIndexHeartbeats.AppendEntriesListener;
 import org.apache.ratis.server.leader.FollowerInfo;
 import org.apache.ratis.server.leader.LeaderState;
@@ -80,8 +81,11 @@
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+
 import java.util.function.LongSupplier;
 import java.util.function.Predicate;
+import java.util.function.Supplier;
 import java.util.function.ToLongFunction;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
@@ -224,6 +228,19 @@ CompletableFuture<Void> stopAll() {
     }
   }
 
+  /** A write reply that has been built but not yet sent to the client.  All fields are final. */
+  private static class HeldReply {
+    private final PendingRequest pending; // the request whose reply is being held
+    private final RaftClientReply reply; // the reply passed to pending.setReply(..) when flushed
+    private final long index; // the log index of the request; used to advance repliedIndex
+
+    HeldReply(PendingRequest pending, RaftClientReply reply, long index) {
+      this.pending = pending;
+      this.reply = reply;
+      this.index = index;
+    }
+  }
+
   /** For caching {@link FollowerInfo}s.  This class is immutable. */
   static class CurrentOldFollowerInfos {
     private final RaftConfigurationImpl conf;
@@ -353,10 +370,23 @@ boolean isApplied() {
   private final PendingStepDown pendingStepDown;
 
   private final ReadIndexHeartbeats readIndexHeartbeats;
-  private final boolean readIndexAppliedIndexEnabled;
+  private final RaftServerConfigKeys.Read.ReadIndex.Type readIndexType;
+  private final Supplier<Long> readIndexSupplier;
+  private final MemoizedSupplier<String> readIndexLogPrefixSupplier;
   private final boolean leaderHeartbeatCheckEnabled;
   private final LeaderLease lease;
 
+  /** The interval at which held write replies are flushed. */
+  private final TimeDuration repliedIndexBatchInterval;
+  /** The highest log index for which a write reply has been flushed (sent to the client). */
+  private final AtomicLong repliedIndex;
+  /** Guards {@link #heldReplies}. */
+  private final Object heldRepliesLock = new Object();
+  /** Buffer holding write replies waiting to be flushed. Guarded by {@link #heldRepliesLock}. */
+  private List<HeldReply> heldReplies = new ArrayList<>();
+  /** Daemon thread that periodically flushes held replies. */
+  private volatile Daemon replyFlusher;
+
   LeaderStateImpl(RaftServerImpl server) {
     this.name = ServerStringUtils.generateUnifiedName(server.getMemberId(), getClass());
     this.server = server;
@@ -391,8 +421,30 @@ boolean isApplied() {
     } else {
       this.followerMaxGapThreshold = (long) (followerGapRatioMax * maxPendingRequests);
     }
-    this.readIndexAppliedIndexEnabled = RaftServerConfigKeys.Read.ReadIndex
-        .appliedIndexEnabled(properties);
+    this.readIndexType = RaftServerConfigKeys.Read.ReadIndex.type(properties);
+
+    this.repliedIndexBatchInterval =
+        RaftServerConfigKeys.Read.ReadIndex.repliedIndexBatchInterval(properties);
+    this.repliedIndex = new AtomicLong(state.getLastAppliedIndex());
+
+    switch (readIndexType) {
+    case REPLIED_INDEX:
+      readIndexSupplier = repliedIndex::get;
+      readIndexLogPrefixSupplier = MemoizedSupplier.valueOf(() -> "replied");
+      this.replyFlusher = Daemon.newBuilder()
+          .setName(name + "-ReplyFlusher")
+          .setRunnable(this::runReplyFlusher)
+          .build();
+      break;
+    case APPLIED_INDEX:
+      readIndexSupplier = () -> server.getState().getLastAppliedIndex();
+      readIndexLogPrefixSupplier = MemoizedSupplier.valueOf(() -> "applied");
+      break;
+    case COMMIT_INDEX:
+    default:
+      readIndexSupplier = () -> server.getRaftLog().getLastCommittedIndex();
+      readIndexLogPrefixSupplier = MemoizedSupplier.valueOf(() -> "commit");
+    }
     this.leaderHeartbeatCheckEnabled = RaftServerConfigKeys.Read
         .leaderHeartbeatCheckEnabled(properties);
 
@@ -419,6 +471,10 @@ void start() {
     startupLogEntry.get();
     processor.start();
     senders.forEach(LogAppender::start);
+
+    if (replyFlusher != null) {
+      replyFlusher.start();
+    }
   }
 
   boolean isReady() {
@@ -453,6 +509,7 @@ CompletableFuture<Void> stop() {
     startupLogEntry.get().getAppliedIndexFuture().completeExceptionally(
         new ReadIndexException("failed to obtain read index since: ", nle));
     server.getServerRpc().notifyNotLeader(server.getMemberId().getGroupId());
+    stopReplyFlusher();
     logAppenderMetrics.unregister();
     raftServerMetrics.unregister();
     pendingRequests.close();
@@ -1140,22 +1197,21 @@ public boolean checkLeadership() {
   /**
    * Obtain the current readIndex for read only requests. See Raft paper section 6.4.
    * 1. Leader makes sure at least one log from current term is committed.
-   * 2. Leader record last committed index or applied index (depending on configuration) as readIndex.
+   * 2. Leader records the last committed, applied or replied index (depending on configuration) as readIndex.
    * 3. Leader broadcast heartbeats to followers and waits for acknowledgements.
    * 4. If majority respond success, returns readIndex.
    * @return current readIndex.
    */
   CompletableFuture<Long> getReadIndex(Long readAfterWriteConsistentIndex) {
-    final long index = readIndexAppliedIndexEnabled ?
-        server.getState().getLastAppliedIndex() : server.getRaftLog().getLastCommittedIndex();
+    final long index = readIndexSupplier.get();
     final long readIndex;
     if (readAfterWriteConsistentIndex != null && readAfterWriteConsistentIndex > index) {
       readIndex = readAfterWriteConsistentIndex;
     } else {
       readIndex = index;
     }
     LOG.debug("readIndex={} ({}Index={}, readAfterWriteConsistentIndex={})",
-        readIndex, readIndexAppliedIndexEnabled ? "applied" : "commit",
+        readIndex, readIndexLogPrefixSupplier.get(),
         index, readAfterWriteConsistentIndex);
 
     // if group contains only one member, fast path
@@ -1218,9 +1274,73 @@ private boolean checkLeaderLease() {
   }
 
   void replyPendingRequest(TermIndex termIndex, RaftClientReply reply) {
-    pendingRequests.replyPendingRequest(termIndex, reply);
+    if (readIndexType == Type.REPLIED_INDEX) {
+      // Remove from pending map but hold the reply for batch flushing.
+      final PendingRequest pending = pendingRequests.removePendingRequest(termIndex);
+      if (pending != null) {
+        holdReply(pending, reply, termIndex.getIndex());
+      }
+    } else {
+      pendingRequests.replyPendingRequest(termIndex, reply);
+    }
+  }
+
+  /** Hold a write reply for later batch flushing by appending it to {@link #heldReplies} under the lock. */
+  private void holdReply(PendingRequest pending, RaftClientReply reply, long index) {
+    synchronized (heldRepliesLock) {
+      heldReplies.add(new HeldReply(pending, reply, index));
+    }
   }
 
+  /** Advance {@link #repliedIndex} to cover all held replies, then send those replies to the clients. */
+  private void flushReplies() {
+    final List<HeldReply> toFlush;
+    synchronized (heldRepliesLock) {
+      if (heldReplies.isEmpty()) {
+        return;
+      }
+      toFlush = heldReplies;
+      heldReplies = new ArrayList<>();
+    }
+    // Publish repliedIndex BEFORE completing any reply; otherwise a client that already received its
+    // write reply could obtain a ReadIndex smaller than that write's index, i.e. a stale read.
+    final long batchMax = toFlush.stream().mapToLong(held -> held.index).max().orElse(Long.MIN_VALUE);
+    final long newRepliedIndex = repliedIndex.accumulateAndGet(batchMax, Math::max);
+    for (HeldReply held : toFlush) {
+      held.pending.setReply(held.reply);
+    }
+    LOG.debug("{}: flushed {} replies, repliedIndex={}", name, toFlush.size(), newRepliedIndex);
+  }
+
+  /** The reply flusher daemon loop: sleep for the batch interval, flush, repeat; flushes once more on exit. */
+  private void runReplyFlusher() {
+    while (isRunning()) { // NOTE(review): with a 0 ms interval this loop never pauses; confirm intended
+      try {
+        Thread.sleep(repliedIndexBatchInterval.toLong(TimeUnit.MILLISECONDS));
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt(); // restore the interrupt status before leaving the loop
+        break;
+      }
+      flushReplies();
+    }
+    // Flush remaining on exit so that replies still held when the loop ends are sent.
+    flushReplies();
+  }
+
+  /** Stop the reply flusher daemon: interrupt it, then join it with a timeout of twice the batch interval. */
+  private void stopReplyFlusher() {
+    final Daemon flusher = this.replyFlusher;
+    if (flusher != null) { // the constructor only creates the flusher for REPLIED_INDEX
+      flusher.interrupt();
+      try {
+        flusher.join(repliedIndexBatchInterval.toLong(TimeUnit.MILLISECONDS) * 2);
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt(); // restore the interrupt status for the caller
+      }
+    }
+  }
+
+
   TransactionContext getTransactionContext(TermIndex termIndex) {
     return pendingRequests.getTransactionContext(termIndex);
   }

diff --git a/ratis-server/src/main/java/org/apache/ratis/server/impl/PendingRequests.java b/ratis-server/src/main/java/org/apache/ratis/server/impl/PendingRequests.java
@@ -272,6 +272,18 @@ void replyPendingRequest(TermIndex termIndex, RaftClientReply reply) {
     }
   }
 
+  /**
+   * Remove the {@link PendingRequest} for the given {@link TermIndex} without sending a reply.
+   * @return the removed {@link PendingRequest}, or null if not found.
+   */
+  PendingRequest removePendingRequest(TermIndex termIndex) {
+    final PendingRequest pending = pendingRequests.remove(termIndex);
+    if (pending != null) {
+      Preconditions.assertEquals(termIndex, pending.getTermIndex(), "termIndex"); // sanity check
+    }
+    return pending;
+  }
+
   /**
    * The leader state is stopped. Send NotLeaderException to all the pending
    * requests since they have not got applied to the state machine yet.

diff --git a/ratis-server/src/test/java/org/apache/ratis/LinearizableReadTests.java b/ratis-server/src/test/java/org/apache/ratis/LinearizableReadTests.java
@@ -27,6 +27,7 @@
 import org.apache.ratis.retry.RetryPolicy;
 import org.apache.ratis.server.RaftServer;
 import org.apache.ratis.server.RaftServerConfigKeys;
+import org.apache.ratis.server.RaftServerConfigKeys.Read.ReadIndex.Type;
 import org.apache.ratis.server.impl.MiniRaftCluster;
 import org.apache.ratis.util.Slf4jUtils;
 import org.apache.ratis.util.TimeDuration;
@@ -60,7 +61,7 @@ public abstract class LinearizableReadTests<CLUSTER extends MiniRaftCluster>
 
   public abstract boolean isLeaderLeaseEnabled();
 
-  public abstract boolean readIndexAppliedIndexEnabled();
+  public abstract Type readIndexType();
 
   public abstract void assertRaftProperties(RaftProperties properties);
 
@@ -77,7 +78,7 @@ public void setup() {
     CounterStateMachine.setProperties(p);
     RaftServerConfigKeys.Read.setOption(p, LINEARIZABLE);
     RaftServerConfigKeys.Read.setLeaderLeaseEnabled(p, isLeaderLeaseEnabled());
-    RaftServerConfigKeys.Read.ReadIndex.setAppliedIndexEnabled(p, readIndexAppliedIndexEnabled());
+    RaftServerConfigKeys.Read.ReadIndex.setType(p, readIndexType());
   }
 
   @Test
@@ -143,10 +144,12 @@ static <C extends MiniRaftCluster> void runTestFollowerLinearizableRead(C cluste
 
   @Test
   public void testFollowerLinearizableReadParallel() throws Exception {
-    runWithNewCluster(LinearizableReadTests::runTestFollowerReadOnlyParallel);
+    final Type type = readIndexType();
+    runWithNewCluster(cluster -> runTestFollowerReadOnlyParallel(type, cluster));
   }
 
-  static <C extends MiniRaftCluster> void runTestFollowerReadOnlyParallel(C cluster) throws Exception {
+  static <C extends MiniRaftCluster> void runTestFollowerReadOnlyParallel(Type readIndexType, C cluster)
+      throws Exception {
     final RaftPeerId leaderId = RaftTestUtil.waitForLeader(cluster).getId();
 
     final List<RaftServer.Division> followers = cluster.getFollowers();
@@ -169,8 +172,17 @@ static <C extends MiniRaftCluster> void runTestFollowerReadOnlyParallel(C cluste
         writeReplies.add(new Reply(count, leaderClient.async().send(WAIT_AND_INCREMENT)));
         Thread.sleep(100);
 
-        assertReplyExact(count, f0Client.io().sendReadOnly(QUERY, f0));
-        f1Replies.add(new Reply(count, f1Client.async().sendReadOnly(QUERY, f1)));
+        if (readIndexType == Type.REPLIED_INDEX) {
+          // With REPLIED_INDEX the read index only advances after the leader has applied the
+          // transaction and the reply batch is flushed.  WAIT_AND_INCREMENT takes 500 ms in
+          // the state machine but we only waited 100 ms, so its reply has not been generated
+          // yet and the follower read may only see the preceding sync INCREMENT (count - 1).
+          assertReplyAtLeast(count - 1, f0Client.io().sendReadOnly(QUERY, f0));
+          f1Replies.add(new Reply(count - 1, f1Client.async().sendReadOnly(QUERY, f1)));
+        } else {
+          assertReplyExact(count, f0Client.io().sendReadOnly(QUERY, f0));
+          f1Replies.add(new Reply(count, f1Client.async().sendReadOnly(QUERY, f1)));
+        }
       }
 
       for (int i = 0; i < n; i++) {