Stabilize AdvancedShardAwarenessIT

Bouncheck · Bouncheck · commit 53f16d9e0d51 · 2025-10-14T13:38:16.000+02:00
On recent Scylla versions this test started failing intermittently.
It looks like with newer Scylla the driver somehow hits a scenario where
it successfully initializes a good portion of the connections, then
all connection attempts to one of the nodes get rejected.
It is accompanied by multiple errors like this:
```
19:38:41.582 [s0-admin-1] WARN  c.d.o.d.i.core.pool.ChannelPool - [s0|/127.0.2.2:19042]  Error while opening new channel
com.datastax.oss.driver.api.core.DriverTimeoutException: [s0|id: 0xfc42b7c7, L:/127.0.0.1:11854 - R:/127.0.2.2:19042] Protocol initialization request, step 1 (OPTIONS): timed out after 5000 ms
	at com.datastax.oss.driver.internal.core.channel.ChannelHandlerRequest.onTimeout(ChannelHandlerRequest.java:110)
	at io.netty.util.concurrent.PromiseTask.runTask(PromiseTask.java:98)
	at io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:160)
	at io.netty.util.concurrent.AbstractEventExecutor.runTask(AbstractEventExecutor.java:173)
	at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:166)
	at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:472)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:569)
	at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:998)
	at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
	at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
	at java.base/java.lang.Thread.run(Thread.java:829)
```
Increasing delays between reconnections or even increasing the test timeout
(largest value tried was 40 seconds) does not help with this situation.
The node logs do not show anything raising suspicion. Not even a WARN.

This change lowers the number of nodes to 1 (previously 2) and the number
of expected channels per session to 33 (previously 66) in resource-heavy
test methods. The number of sessions remains at 4.
The reconnection delays in `should_not_struggle_to_fill_pools` will now
start at around 300ms and should not rise above 3200ms.
This is the smallest tested set of changes that seems to resolve the issue.
The test remains meaningful since `should_struggle_to_fill_pools` still
displays considerably worse performance without advanced shard awareness.
diff --git a/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java b/integration-tests/src/test/java/com/datastax/oss/driver/core/pool/AdvancedShardAwarenessIT.java
@@ -51,7 +51,7 @@ public class AdvancedShardAwarenessIT {
 
   @ClassRule
   public static final CustomCcmRule CCM_RULE =
-      CustomCcmRule.builder().withNodes(2).withJvmArgs("--smp=3").build();
+      CustomCcmRule.builder().withNodes(1).withJvmArgs("--smp=3").build();
 
   public static ch.qos.logback.classic.Logger channelPoolLogger =
       (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(ChannelPool.class);
@@ -100,33 +100,22 @@ public void stopCapturingLogs() {
   public void should_initialize_all_channels(boolean reuseAddress) {
     int expectedChannelsPerNode = 6; // Divisible by smp
     String node1 = CCM_RULE.getCcmBridge().getNodeIpAddress(1);
-    String node2 = CCM_RULE.getCcmBridge().getNodeIpAddress(2);
     Pattern reconnectionPattern1 =
         Pattern.compile(".*" + Pattern.quote(node1) + ".*Scheduling next reconnection in.*");
-    Pattern reconnectionPattern2 =
-        Pattern.compile(".*" + Pattern.quote(node2) + ".*Scheduling next reconnection in.*");
-    Set<Pattern> forbiddenOccurences =
-        ImmutableSet.of(shardMismatchPattern, reconnectionPattern1, reconnectionPattern2);
+    Set<Pattern> forbiddenOccurences = ImmutableSet.of(shardMismatchPattern, reconnectionPattern1);
     Map<Pattern, Integer> expectedOccurences =
         ImmutableMap.of(
             Pattern.compile(
-                    ".*"
-                        + Pattern.quote(node1)
-                        + ":19042.*Reconnection attempt complete, 6/6 channels.*"),
-                1,
-            Pattern.compile(
-                    ".*"
-                        + Pattern.quote(node2)
-                        + ":19042.*Reconnection attempt complete, 6/6 channels.*"),
-                1);
+                ".*"
+                    + Pattern.quote(node1)
+                    + ":19042.*Reconnection attempt complete, 6/6 channels.*"),
+            1);
     DriverConfigLoader loader =
         SessionUtils.configLoaderBuilder()
             .withBoolean(DefaultDriverOption.SOCKET_REUSE_ADDRESS, reuseAddress)
             .withBoolean(DefaultDriverOption.CONNECTION_ADVANCED_SHARD_AWARENESS_ENABLED, true)
             .withInt(DefaultDriverOption.ADVANCED_SHARD_AWARENESS_PORT_LOW, 10000)
             .withInt(DefaultDriverOption.ADVANCED_SHARD_AWARENESS_PORT_HIGH, 60000)
-            // Due to rounding up the connections per shard this will result in 6 connections per
-            // node
             .withInt(DefaultDriverOption.CONNECTION_POOL_LOCAL_SIZE, expectedChannelsPerNode)
             .build();
     try (CqlSession session =
@@ -149,13 +138,13 @@ public void should_initialize_all_channels(boolean reuseAddress) {
 
   @Test
   public void should_see_mismatched_shard() {
-    int expectedChannelsPerNode = 66; // Divisible by smp
+    int expectedChannelsPerNode = 33; // Divisible by smp
     DriverConfigLoader loader =
         SessionUtils.configLoaderBuilder()
             .withBoolean(DefaultDriverOption.CONNECTION_ADVANCED_SHARD_AWARENESS_ENABLED, true)
             .withInt(DefaultDriverOption.ADVANCED_SHARD_AWARENESS_PORT_LOW, 10000)
             .withInt(DefaultDriverOption.ADVANCED_SHARD_AWARENESS_PORT_HIGH, 60000)
-            .withInt(DefaultDriverOption.CONNECTION_POOL_LOCAL_SIZE, 66)
+            .withInt(DefaultDriverOption.CONNECTION_POOL_LOCAL_SIZE, expectedChannelsPerNode)
             .build();
     try (CqlSession session =
         CqlSession.builder()
@@ -176,13 +165,13 @@ public void should_see_mismatched_shard() {
   // There is no need to run this as a test, but it serves as a comparison
   @SuppressWarnings("unused")
   public void should_struggle_to_fill_pools() {
-    int expectedChannelsPerNode = 66; // Divisible by smp
+    int expectedChannelsPerNode = 33; // Divisible by smp
     DriverConfigLoader loader =
         SessionUtils.configLoaderBuilder()
             .withBoolean(DefaultDriverOption.CONNECTION_ADVANCED_SHARD_AWARENESS_ENABLED, false)
-            .withInt(DefaultDriverOption.CONNECTION_POOL_LOCAL_SIZE, 66)
-            .withDuration(DefaultDriverOption.RECONNECTION_BASE_DELAY, Duration.ofMillis(200))
-            .withDuration(DefaultDriverOption.RECONNECTION_MAX_DELAY, Duration.ofMillis(4000))
+            .withInt(DefaultDriverOption.CONNECTION_POOL_LOCAL_SIZE, expectedChannelsPerNode)
+            .withDuration(DefaultDriverOption.RECONNECTION_BASE_DELAY, Duration.ofMillis(300))
+            .withDuration(DefaultDriverOption.RECONNECTION_MAX_DELAY, Duration.ofMillis(3200))
             .build();
     CqlSessionBuilder builder =
         CqlSession.builder()
@@ -210,13 +199,13 @@ public void should_struggle_to_fill_pools() {
 
   @Test
   public void should_not_struggle_to_fill_pools() {
-    int expectedChannelsPerNode = 66;
+    int expectedChannelsPerNode = 33;
     DriverConfigLoader loader =
         SessionUtils.configLoaderBuilder()
             .withBoolean(DefaultDriverOption.CONNECTION_ADVANCED_SHARD_AWARENESS_ENABLED, true)
             .withInt(DefaultDriverOption.CONNECTION_POOL_LOCAL_SIZE, expectedChannelsPerNode)
-            .withDuration(DefaultDriverOption.RECONNECTION_BASE_DELAY, Duration.ofMillis(10))
-            .withDuration(DefaultDriverOption.RECONNECTION_MAX_DELAY, Duration.ofMillis(20))
+            .withDuration(DefaultDriverOption.RECONNECTION_BASE_DELAY, Duration.ofMillis(300))
+            .withDuration(DefaultDriverOption.RECONNECTION_MAX_DELAY, Duration.ofMillis(3200))
             .build();
     CqlSessionBuilder builder =
         CqlSession.builder()
@@ -239,29 +228,24 @@ public void should_not_struggle_to_fill_pools() {
           .until(() -> areAllPoolsFullyInitialized(allSessions, expectedChannelsPerNode));
       int tolerance = 2; // Sometimes socket ends up already in use
       String node1 = CCM_RULE.getCcmBridge().getNodeIpAddress(1);
-      String node2 = CCM_RULE.getCcmBridge().getNodeIpAddress(2);
       Pattern reconnectionPattern1 =
           Pattern.compile(".*" + Pattern.quote(node1) + ".*Scheduling next reconnection in.*");
-      Pattern reconnectionPattern2 =
-          Pattern.compile(".*" + Pattern.quote(node2) + ".*Scheduling next reconnection in.*");
       Map<Pattern, Integer> expectedOccurences =
           ImmutableMap.of(
               Pattern.compile(
-                      ".*"
-                          + Pattern.quote(node1)
-                          + ":19042.*Reconnection attempt complete, 66/66 channels.*"),
-                  1 * sessions,
-              Pattern.compile(
-                      ".*"
-                          + Pattern.quote(node2)
-                          + ":19042.*Reconnection attempt complete, 66/66 channels.*"),
-                  1 * sessions);
+                  ".*"
+                      + Pattern.quote(node1)
+                      + ":19042.*Reconnection attempt complete, "
+                      + expectedChannelsPerNode
+                      + "/"
+                      + expectedChannelsPerNode
+                      + " channels.*"),
+              sessions);
       List<ILoggingEvent> logsCopy = ImmutableList.copyOf(appender.list);
       expectedOccurences.forEach(
           (pattern, times) -> assertMatchesAtLeast(pattern, times, logsCopy));
       assertNoLogMatches(shardMismatchPattern, logsCopy);
       assertMatchesAtMost(reconnectionPattern1, tolerance, logsCopy);
-      assertMatchesAtMost(reconnectionPattern2, tolerance, logsCopy);
     }
   }