From 1291582d9267844bcb096d0c6e7c1c959fa2eb5b Mon Sep 17 00:00:00 2001 From: becomeStar Date: Mon, 5 Jan 2026 00:03:02 +0900 Subject: [PATCH 1/4] xds: Initiate proactive connection on failure Implement proactive connection logic in RingHashLoadBalancer as outlined in gRFC A61. This address the missing logic where the balancer should initialize the first IDLE child when a child balancer reports TRANSIENT_FAILURE and no other children are connecting. This behavior, which was previously present before #10610, ensures that a backup subchannel starts connecting immediately outside of the picker flow, reducing failover latency. Fixes #12024 --- .../io/grpc/xds/RingHashLoadBalancer.java | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java b/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java index 21ee914ff8f..169c7308ab4 100644 --- a/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java +++ b/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java @@ -214,12 +214,42 @@ protected void updateOverallBalancingState() { overallState = TRANSIENT_FAILURE; } + // gRFC A61: trigger lazy initialization outside picker when no READY child exists, + // at least one child is in TRANSIENT_FAILURE, and no child is currently CONNECTING + maybeTriggerIdleChildConnection(numReady, numTF, numConnecting, numIdle); + RingHashPicker picker = new RingHashPicker(syncContext, ring, getChildLbStates(), requestHashHeaderKey, random); getHelper().updateBalancingState(overallState, picker); this.currentConnectivityState = overallState; } + /** + * Triggers lazy initialization of an IDLE child pickFirst load balancer when: + * + * + * + *

This corresponds to the second lazy-initialization condition described in gRFC A61, where + * recovery from TRANSIENT_FAILURE must be triggered outside the picker when no active connection + * attempt is in progress. + */ + private void maybeTriggerIdleChildConnection( + int numReady, int numTF, int numConnecting, int numIdle) { + if (numReady == 0 && numTF > 0 && numConnecting == 0 && numIdle > 0) { + for (ChildLbState child : getChildLbStates()) { + if (child.getCurrentState() == ConnectivityState.IDLE) { + child.getLb().requestConnection(); + return; + } + } + } + } + @Override protected ChildLbState createChildLbState(Object key) { return new ChildLbState(key, lazyLbFactory); From 7d4a9614be9ceb0b3754b753a3a2ef9be34bd966 Mon Sep 17 00:00:00 2001 From: becomeStar Date: Mon, 5 Jan 2026 00:19:04 +0900 Subject: [PATCH 2/4] xds: Update and add tests for proactive connection Update existing unit tests and add new test cases to validate the proactive connection behavior. Verification counts in several tests have been adjusted to reflect that the balancer now initiates connections immediately upon subchannel failure. --- .../io/grpc/xds/RingHashLoadBalancerTest.java | 62 ++++++++++++++----- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/xds/src/test/java/io/grpc/xds/RingHashLoadBalancerTest.java b/xds/src/test/java/io/grpc/xds/RingHashLoadBalancerTest.java index d65cf96c00d..174c7058c5a 100644 --- a/xds/src/test/java/io/grpc/xds/RingHashLoadBalancerTest.java +++ b/xds/src/test/java/io/grpc/xds/RingHashLoadBalancerTest.java @@ -255,7 +255,7 @@ public void aggregateSubchannelStates_connectingReadyIdleFailure() { inOrder.verify(helper).refreshNameResolution(); inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any()); } - verifyConnection(0); + verifyConnection(1); } private void verifyConnection(int times) { @@ -537,7 +537,7 @@ public void pickWithRandomHash_firstSubchannelInTransientFailure_remainingSubcha // Bring one subchannel to TRANSIENT_FAILURE. deliverSubchannelUnreachable(getSubChannel(servers.get(0))); verify(helper).updateBalancingState(eq(CONNECTING), pickerCaptor.capture()); - verifyConnection(0); + verifyConnection(1); // Pick subchannel with random hash does trigger connection by walking the ring // and choosing the first (at most one) IDLE subchannel along the way. @@ -583,7 +583,7 @@ public void skipFailingHosts_pickNextNonFailingHost() { getSubChannel(servers.get(0)), ConnectivityStateInfo.forTransientFailure( Status.UNAVAILABLE.withDescription("unreachable"))); - verify(helper).updateBalancingState(eq(CONNECTING), pickerCaptor.capture()); + verify(helper, atLeastOnce()).updateBalancingState(eq(CONNECTING), pickerCaptor.capture()); PickResult result = pickerCaptor.getValue().pickSubchannel(args); assertThat(result.getStatus().isOk()).isTrue(); @@ -649,7 +649,7 @@ public void skipFailingHosts_firstTwoHostsFailed_pickNextFirstReady() { ConnectivityStateInfo.forTransientFailure( Status.PERMISSION_DENIED.withDescription("permission denied"))); verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture()); - verifyConnection(0); + verifyConnection(2); PickResult result = pickerCaptor.getValue().pickSubchannel(args); // activate last subchannel assertThat(result.getStatus().isOk()).isTrue(); int expectedCount = PickFirstLoadBalancerProvider.isEnabledNewPickFirst() ? 0 : 1; @@ -721,7 +721,7 @@ public void allSubchannelsInTransientFailure() { } verify(helper, atLeastOnce()) .updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture()); - verifyConnection(0); + verifyConnection(2); // Picking subchannel triggers connection. RPC hash hits server0. PickSubchannelArgs args = getDefaultPickSubchannelArgsForServer(0); @@ -740,12 +740,13 @@ public void firstSubchannelIdle() { List servers = createWeightedServerAddrs(1, 1, 1); initializeLbSubchannels(config, servers); - // Go to TF does nothing, though PF will try to reconnect after backoff + // As per gRFC A61, entering TF triggers a proactive connection attempt + // on an IDLE subchannel because no other subchannel is currently CONNECTING. deliverSubchannelState(getSubchannel(servers, 1), ConnectivityStateInfo.forTransientFailure( Status.UNAVAILABLE.withDescription("unreachable"))); verify(helper).updateBalancingState(eq(CONNECTING), pickerCaptor.capture()); - verifyConnection(0); + verifyConnection(1); // Picking subchannel triggers connection. RPC hash hits server0. PickSubchannelArgs args = getDefaultPickSubchannelArgs(hashFunc.hashVoid()); @@ -796,7 +797,7 @@ public void firstSubchannelFailure() { ConnectivityStateInfo.forTransientFailure( Status.UNAVAILABLE.withDescription("unreachable"))); verify(helper).updateBalancingState(eq(CONNECTING), pickerCaptor.capture()); - verifyConnection(0); + verifyConnection(1); // Per GRFC A61 Picking subchannel should no longer request connections that were failing PickSubchannelArgs args = getDefaultPickSubchannelArgs(hashFunc.hashVoid()); @@ -805,8 +806,6 @@ public void firstSubchannelFailure() { assertThat(result.getStatus().isOk()).isTrue(); assertThat(result.getSubchannel()).isNull(); verify(subchannelList.get(0), never()).requestConnection(); // In TF - verify(subchannelList.get(1)).requestConnection(); - verify(subchannelList.get(2), never()).requestConnection(); // Not one of the first 2 } @Test @@ -824,7 +823,7 @@ public void secondSubchannelConnecting() { Subchannel firstSubchannel = getSubchannel(servers, 0); deliverSubchannelUnreachable(firstSubchannel); - verifyConnection(0); + verifyConnection(1); deliverSubchannelState(getSubchannel(servers, 2), CSI_CONNECTING); verify(helper, times(2)).updateBalancingState(eq(CONNECTING), pickerCaptor.capture()); @@ -833,7 +832,7 @@ public void secondSubchannelConnecting() { // Picking subchannel when idle triggers connection. deliverSubchannelState(getSubchannel(servers, 2), ConnectivityStateInfo.forNonError(IDLE)); - verifyConnection(0); + verifyConnection(1); PickSubchannelArgs args = getDefaultPickSubchannelArgs(hashFunc.hashVoid()); PickResult result = pickerCaptor.getValue().pickSubchannel(args); assertThat(result.getStatus().isOk()).isTrue(); @@ -857,7 +856,7 @@ public void secondSubchannelFailure() { deliverSubchannelUnreachable(firstSubchannel); deliverSubchannelUnreachable(getSubchannel(servers, 2)); verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture()); - verifyConnection(0); + verifyConnection(2); // Picking subchannel triggers connection. PickSubchannelArgs args = getDefaultPickSubchannelArgs(hashFunc.hashVoid()); @@ -887,7 +886,7 @@ public void thirdSubchannelConnecting() { deliverSubchannelState(getSubchannel(servers, 1), CSI_CONNECTING); verify(helper, atLeastOnce()) .updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture()); - verifyConnection(0); + verifyConnection(2); // Picking subchannel should not trigger connection per gRFC A61. PickSubchannelArgs args = getDefaultPickSubchannelArgs(hashFunc.hashVoid()); @@ -909,7 +908,7 @@ public void stickyTransientFailure() { deliverSubchannelUnreachable(firstSubchannel); verify(helper).updateBalancingState(eq(CONNECTING), pickerCaptor.capture()); - verifyConnection(0); + verifyConnection(1); reset(helper); deliverSubchannelState(firstSubchannel, ConnectivityStateInfo.forNonError(IDLE)); @@ -1127,6 +1126,39 @@ public void config_equalsTester() { .testEquals(); } + @Test + public void tfWithoutConnectingChild_triggersIdleChildConnection() { + RingHashConfig config = new RingHashConfig(10, 100, ""); + List servers = createWeightedServerAddrs(1, 1); + + initializeLbSubchannels(config, servers); + + Subchannel tfSubchannel = getSubchannel(servers, 0); + Subchannel idleSubchannel = getSubchannel(servers, 1); + + deliverSubchannelUnreachable(tfSubchannel); + + Subchannel requested = connectionRequestedQueue.poll(); + assertThat(requested).isSameInstanceAs(idleSubchannel); + assertThat(connectionRequestedQueue.poll()).isNull(); + } + + @Test + public void tfWithReadyChild_doesNotTriggerIdleChildConnection() { + RingHashConfig config = new RingHashConfig(10, 100, ""); + List servers = createWeightedServerAddrs(1, 1, 1); + + initializeLbSubchannels(config, servers); + + Subchannel tfSubchannel = getSubchannel(servers, 0); + Subchannel readySubchannel = getSubchannel(servers, 1); + + deliverSubchannelState(readySubchannel, ConnectivityStateInfo.forNonError(READY)); + deliverSubchannelUnreachable(tfSubchannel); + + assertThat(connectionRequestedQueue.poll()).isNull(); + } + private List initializeLbSubchannels(RingHashConfig config, List servers, InitializationFlags... initFlags) { From f22dd7b11c2dc18cb84ec5ede36d10046598c689 Mon Sep 17 00:00:00 2001 From: becomeStar Date: Tue, 6 Jan 2026 00:01:24 +0900 Subject: [PATCH 3/4] xds: Update javadoc for ringHash proactive connection logic --- xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java b/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java index 169c7308ab4..5ed0c50a8ec 100644 --- a/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java +++ b/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java @@ -225,7 +225,8 @@ protected void updateOverallBalancingState() { } /** - * Triggers lazy initialization of an IDLE child pickFirst load balancer when: + * Triggers proactive connection of an IDLE child load balancer when the following conditions are + * met. * *

* - *

This corresponds to the second lazy-initialization condition described in gRFC A61, where - * recovery from TRANSIENT_FAILURE must be triggered outside the picker when no active connection - * attempt is in progress. + *

This corresponds to the proactive connection logic described in gRFC A61, where recovery + * from TRANSIENT_FAILURE must be triggered outside the picker when no active connection attempt + * is in progress. */ private void maybeTriggerIdleChildConnection( int numReady, int numTF, int numConnecting, int numIdle) { From f5a4934a9549bfb7bca0991685fd094e7ac4c620 Mon Sep 17 00:00:00 2001 From: becomeStar Date: Tue, 6 Jan 2026 14:49:05 +0900 Subject: [PATCH 4/4] xds: Address review comments for RingHash proactive connection --- .../io/grpc/xds/RingHashLoadBalancer.java | 37 +++++++------------ .../io/grpc/xds/RingHashLoadBalancerTest.java | 2 + 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java b/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java index 5ed0c50a8ec..513f4d643ea 100644 --- a/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java +++ b/xds/src/main/java/io/grpc/xds/RingHashLoadBalancer.java @@ -214,9 +214,12 @@ protected void updateOverallBalancingState() { overallState = TRANSIENT_FAILURE; } - // gRFC A61: trigger lazy initialization outside picker when no READY child exists, - // at least one child is in TRANSIENT_FAILURE, and no child is currently CONNECTING - maybeTriggerIdleChildConnection(numReady, numTF, numConnecting, numIdle); + // gRFC A61: if the aggregated connectivity state is TRANSIENT_FAILURE or CONNECTING and + // there are no endpoints in CONNECTING state, the ring_hash policy will choose one of + // the endpoints in IDLE state (if any) to trigger a connection attempt on + if (numReady == 0 && numTF > 0 && numConnecting == 0 && numIdle > 0) { + triggerIdleChildConnection(); + } RingHashPicker picker = new RingHashPicker(syncContext, ring, getChildLbStates(), requestHashHeaderKey, random); @@ -224,29 +227,15 @@ protected void updateOverallBalancingState() { this.currentConnectivityState = overallState; } + /** - * Triggers proactive connection of an IDLE child load balancer when the following conditions are - * met. - * - *

- * - *

This corresponds to the proactive connection logic described in gRFC A61, where recovery - * from TRANSIENT_FAILURE must be triggered outside the picker when no active connection attempt - * is in progress. + * Triggers a connection attempt for the first IDLE child load balancer. */ - private void maybeTriggerIdleChildConnection( - int numReady, int numTF, int numConnecting, int numIdle) { - if (numReady == 0 && numTF > 0 && numConnecting == 0 && numIdle > 0) { - for (ChildLbState child : getChildLbStates()) { - if (child.getCurrentState() == ConnectivityState.IDLE) { - child.getLb().requestConnection(); - return; - } + private void triggerIdleChildConnection() { + for (ChildLbState child : getChildLbStates()) { + if (child.getCurrentState() == ConnectivityState.IDLE) { + child.getLb().requestConnection(); + return; } } } diff --git a/xds/src/test/java/io/grpc/xds/RingHashLoadBalancerTest.java b/xds/src/test/java/io/grpc/xds/RingHashLoadBalancerTest.java index 174c7058c5a..b515ed81158 100644 --- a/xds/src/test/java/io/grpc/xds/RingHashLoadBalancerTest.java +++ b/xds/src/test/java/io/grpc/xds/RingHashLoadBalancerTest.java @@ -806,6 +806,8 @@ public void firstSubchannelFailure() { assertThat(result.getStatus().isOk()).isTrue(); assertThat(result.getSubchannel()).isNull(); verify(subchannelList.get(0), never()).requestConnection(); // In TF + verify(subchannelList.get(1)).requestConnection(); + verify(subchannelList.get(2), never()).requestConnection(); // Not one of the first 2 } @Test