Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
import org.opensearch.cluster.routing.allocation.decider.ResizeAllocationDecider;
import org.opensearch.cluster.routing.allocation.decider.RestoreInProgressAllocationDecider;
import org.opensearch.cluster.routing.allocation.decider.SameShardAllocationDecider;
import org.opensearch.cluster.routing.allocation.decider.ReplicaOnlyAllocationDecider;
import org.opensearch.cluster.routing.allocation.decider.SearchReplicaAllocationDecider;
import org.opensearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
import org.opensearch.cluster.routing.allocation.decider.SnapshotInProgressAllocationDecider;
Expand Down Expand Up @@ -396,6 +397,7 @@ public static Collection<AllocationDecider> createAllocationDeciders(
addAllocationDecider(deciders, new RestoreInProgressAllocationDecider());
addAllocationDecider(deciders, new FilterAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new SearchReplicaAllocationDecider());
addAllocationDecider(deciders, new ReplicaOnlyAllocationDecider());
addAllocationDecider(deciders, new SameShardAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new DiskThresholdDecider(settings, clusterSettings));
addAllocationDecider(deciders, new WarmDiskThresholdDecider(settings, clusterSettings));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ public static boolean isDedicatedWarmNode(Settings settings) {
return getRolesFromSettings(settings).stream().allMatch(DiscoveryNodeRole.WARM_ROLE::equals);
}

public static boolean isReplicaOnlyNode(Settings settings) {
return hasRole(settings, DiscoveryNodeRole.REPLICA_ONLY_ROLE);
}

private final String nodeName;
private final String nodeId;
private final String ephemeralId;
Expand Down Expand Up @@ -542,6 +546,15 @@ public boolean isSearchNode() {
return roles.contains(DiscoveryNodeRole.SEARCH_ROLE);
}

/**
* Returns whether the node is a replica-only node.
*
* @return true if the node contains a replica_only role, false otherwise
*/
public boolean isReplicaOnlyNode() {
return roles.contains(DiscoveryNodeRole.REPLICA_ONLY_ROLE);
}

/**
* Returns whether the node is a remote store node.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -337,11 +337,48 @@ public void validateRole(List<DiscoveryNodeRole> roles) {

};

/**
* Represents the role for a replica-only node, which hosts replica shards from auto-expand indices
* without participating in primary shard hosting. Replica-only nodes:
* - Never host primary shards or promote replicas to primaries
* - Only accept replica shards from indices with {@code index.auto_expand_replicas: 0-all}
* - Do not trigger rebalancing on regular data nodes when joining/leaving the cluster
* - Must be a dedicated role (cannot coexist with any other role)
*/
public static final DiscoveryNodeRole REPLICA_ONLY_ROLE = new DiscoveryNodeRole("replica_only", "ro", true) {

@Override
public Setting<Boolean> legacySetting() {
// replica_only role is added in 3.5 so doesn't need to configure legacy setting
return null;
}

@Override
public void validateRole(List<DiscoveryNodeRole> roles) {
// replica_only role must be the only role on a node (dedicated)
for (DiscoveryNodeRole role : roles) {
if (role.equals(DiscoveryNodeRole.REPLICA_ONLY_ROLE) == false) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"%s role must be the only role on a node. Cannot be combined with: %s",
DiscoveryNodeRole.REPLICA_ONLY_ROLE.roleName(),
role.roleName()
)
);
}
}
}

};

/**
* The built-in node roles.
*/
public static SortedSet<DiscoveryNodeRole> BUILT_IN_ROLES = Collections.unmodifiableSortedSet(
new TreeSet<>(Arrays.asList(DATA_ROLE, INGEST_ROLE, CLUSTER_MANAGER_ROLE, REMOTE_CLUSTER_CLIENT_ROLE, WARM_ROLE))
new TreeSet<>(
Arrays.asList(DATA_ROLE, INGEST_ROLE, CLUSTER_MANAGER_ROLE, REMOTE_CLUSTER_CLIENT_ROLE, WARM_ROLE, REPLICA_ONLY_ROLE)
)
);

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

package org.opensearch.cluster.routing;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.util.CollectionUtil;
import org.opensearch.cluster.ClusterState;
Expand Down Expand Up @@ -85,6 +86,8 @@
*/
@PublicApi(since = "1.0.0")
public class RoutingNodes implements Iterable<RoutingNode> {
private static final Logger logger = LogManager.getLogger(RoutingNodes.class);

private final Metadata metadata;

private final Map<String, RoutingNode> nodesToShards = new HashMap<>();
Expand Down Expand Up @@ -812,6 +815,21 @@ private void promoteReplicaToPrimary(ShardRouting activeReplica, RoutingChangesO
// if the activeReplica was relocating before this call to failShard, its relocation was cancelled earlier when we
// failed initializing replica shards (and moved replica relocation source back to started)
assert activeReplica.started() : "replica relocation should have been cancelled: " + activeReplica;

// CRITICAL: Never promote replicas on replica-only nodes
RoutingNode routingNode = node(activeReplica.currentNodeId());
if (routingNode != null && routingNode.node().isReplicaOnlyNode()) {
logger.warn(
"Cannot promote replica shard [{}] to primary on replica-only node [{}]. "
+ "Shard will remain as replica. Primary must be allocated to a regular data node.",
activeReplica.shardId(),
routingNode.nodeId()
);
// Do NOT call promoteActiveReplicaShardToPrimary - just return
// The primary will remain unassigned, triggering allocation to a data node
return;
}
Comment on lines +818 to +831
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Critical: Early return leaves cluster without a primary shard.

The guard returns early without promoting the replica, but at this point movePrimaryToUnassignedAndDemoteToReplica() has already been called in unassignPrimaryAndPromoteActiveReplicaIfExists(). This leaves the shard group with no primary:

  1. Original primary → demoted to unassigned replica
  2. Active replica on replica-only node → remains a replica (not promoted)
  3. Result: No primary exists for this shard

The replica-only check should occur before the primary is demoted, ideally in unassignPrimaryAndPromoteActiveReplicaIfExists() when selecting the active replica. The replica selection logic should skip replicas on replica-only nodes.

🔎 Suggested approach

Modify the replica selection in unassignPrimaryAndPromoteActiveReplicaIfExists() to exclude replicas on replica-only nodes:

 private void unassignPrimaryAndPromoteActiveReplicaIfExists(
     ShardRouting failedShard,
     UnassignedInfo unassignedInfo,
     RoutingChangesObserver routingChangesObserver
 ) {
     assert failedShard.primary();
     ShardRouting activeReplica = null;
     if (isMigratingToRemoteStore(metadata)) {
         activeReplica = activeReplicaOnRemoteNode(failedShard.shardId());
+        // Skip if on replica-only node
+        if (activeReplica != null && isReplicaOnReplicaOnlyNode(activeReplica)) {
+            activeReplica = null;
+        }
     }
     if (activeReplica == null) {
         if (metadata.isSegmentReplicationEnabled(failedShard.getIndexName())) {
             activeReplica = activeReplicaWithOldestVersion(failedShard.shardId());
         } else {
             activeReplica = activeReplicaWithHighestVersion(failedShard.shardId());
         }
+        // Skip if on replica-only node
+        if (activeReplica != null && isReplicaOnReplicaOnlyNode(activeReplica)) {
+            activeReplica = null;
+        }
     }
     // ... rest of method
 }
+
+private boolean isReplicaOnReplicaOnlyNode(ShardRouting replica) {
+    RoutingNode rn = node(replica.currentNodeId());
+    return rn != null && rn.node().isReplicaOnlyNode();
+}

Alternatively, filter replica-only nodes in activeReplicaWithHighestVersion() and activeReplicaWithOldestVersion().

Committable suggestion skipped: line range outside the PR's diff.


promoteActiveReplicaShardToPrimary(activeReplica);
routingChangesObserver.replicaPromoted(activeReplica);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,15 @@ MoveDecision decideMove(final ShardRouting shardRouting) {
private Map<String, BalancedShardsAllocator.ModelNode> buildModelFromAssigned() {
Map<String, BalancedShardsAllocator.ModelNode> nodes = new HashMap<>();
for (RoutingNode rn : routingNodes) {
// EXCLUDE replica-only nodes from rebalancing calculations
// These nodes are managed solely by auto-expand replica logic
if (rn.node().isReplicaOnlyNode()) {
if (logger.isTraceEnabled()) {
logger.trace("Excluding replica-only node [{}] from rebalancing model", rn.nodeId());
}
continue;
}

BalancedShardsAllocator.ModelNode node = new BalancedShardsAllocator.ModelNode(rn);
nodes.put(rn.nodeId(), node);
for (ShardRouting shard : rn) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.cluster.routing.allocation.decider;

import org.opensearch.cluster.metadata.AutoExpandReplicas;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.cluster.node.DiscoveryNode;
import org.opensearch.cluster.routing.RoutingNode;
import org.opensearch.cluster.routing.ShardRouting;
import org.opensearch.cluster.routing.allocation.RoutingAllocation;

/**
* This allocation decider ensures that replica-only nodes follow strict allocation rules:
* <ul>
* <li>Replica-only nodes never host primary shards</li>
* <li>Replica-only nodes only host replica shards from indices with auto_expand_replicas: 0-all</li>
* <li>Primary shards are never promoted on replica-only nodes</li>
* <li>Regular data nodes are not affected by replica-only node presence</li>
* </ul>
*
* @opensearch.internal
*/
public class ReplicaOnlyAllocationDecider extends AllocationDecider {

public static final String NAME = "replica_only";

@Override
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
return canAllocate(shardRouting, node.node(), allocation);
}

@Override
public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
return canAllocate(shardRouting, node.node(), allocation);
}

@Override
public Decision canForceAllocatePrimary(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
// CRITICAL: Never allow primary allocation to replica-only nodes, even with force
if (node.node().isReplicaOnlyNode()) {
return allocation.decision(
Decision.NO,
NAME,
"primary shard [%s] cannot be force allocated to replica-only node [%s]",
shardRouting.shardId(),
node.nodeId()
);
}
return allocation.decision(Decision.YES, NAME, "node is not a replica-only node");
}

@Override
public Decision shouldAutoExpandToNode(IndexMetadata indexMetadata, DiscoveryNode node, RoutingAllocation allocation) {
if (!node.isReplicaOnlyNode()) {
// Regular data nodes participate in auto-expand for all indices
return allocation.decision(Decision.YES, NAME, "node [%s] is a data node, eligible for auto-expand", node.getId());
}

// Replica-only nodes only participate in 0-all auto-expand
AutoExpandReplicas autoExpandReplicas = AutoExpandReplicas.SETTING.get(indexMetadata.getSettings());
boolean isAutoExpandAll = autoExpandReplicas.isEnabled()
&& autoExpandReplicas.getMaxReplicas() == Integer.MAX_VALUE
&& autoExpandReplicas.toString().startsWith("0-");

if (isAutoExpandAll) {
return allocation.decision(
Decision.YES,
NAME,
"replica-only node [%s] is eligible for auto-expand replicas from index [%s] with auto_expand_replicas: 0-all",
node.getId(),
indexMetadata.getIndex().getName()
);
} else {
return allocation.decision(
Decision.NO,
NAME,
"replica-only node [%s] is not eligible for index [%s] without auto_expand_replicas: 0-all",
node.getId(),
indexMetadata.getIndex().getName()
);
}
}

private Decision canAllocate(ShardRouting shardRouting, DiscoveryNode node, RoutingAllocation allocation) {
boolean isReplicaOnlyNode = node.isReplicaOnlyNode();

// Case 1: Primary shard allocation
if (shardRouting.primary()) {
if (isReplicaOnlyNode) {
return allocation.decision(
Decision.NO,
NAME,
"primary shard [%s] cannot be allocated to replica-only node [%s]",
shardRouting.shardId(),
node.getId()
);
}
// Allow primaries on regular data nodes
return allocation.decision(Decision.YES, NAME, "node [%s] is a data node, can host primary shard", node.getId());
}

// Case 2: Replica shard allocation
IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index());
AutoExpandReplicas autoExpandReplicas = AutoExpandReplicas.SETTING.get(indexMetadata.getSettings());
boolean isAutoExpandAll = autoExpandReplicas.isEnabled()
&& autoExpandReplicas.getMaxReplicas() == Integer.MAX_VALUE
&& autoExpandReplicas.toString().startsWith("0-");

if (isReplicaOnlyNode) {
if (isAutoExpandAll) {
return allocation.decision(
Decision.YES,
NAME,
"replica shard [%s] from auto-expand (0-all) index can be allocated to replica-only node [%s]",
shardRouting.shardId(),
node.getId()
);
} else {
return allocation.decision(
Decision.NO,
NAME,
"replica shard [%s] cannot be allocated to replica-only node [%s] "
+ "because index [%s] does not have auto_expand_replicas: 0-all",
shardRouting.shardId(),
node.getId(),
indexMetadata.getIndex().getName()
);
}
}

// Regular data nodes can host any replica
return allocation.decision(Decision.YES, NAME, "node [%s] is a data node, can host replica shard", node.getId());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,41 @@ public void testRoleNameIsCaseInsensitive() {
assertEquals(roleName.toLowerCase(Locale.ROOT), dynamicRole.roleName());
assertEquals(roleNameAbbreviation.toLowerCase(Locale.ROOT), dynamicRole.roleNameAbbreviation());
}

public void testReplicaOnlyRoleIsDedicated() {
// replica_only role cannot coexist with any other role
final IllegalArgumentException e1 = expectThrows(
IllegalArgumentException.class,
() -> DiscoveryNodeRole.REPLICA_ONLY_ROLE.validateRole(Arrays.asList(DiscoveryNodeRole.REPLICA_ONLY_ROLE, DiscoveryNodeRole.DATA_ROLE))
);
assertThat(e1, hasToString(containsString("replica_only role must be the only role")));

final IllegalArgumentException e2 = expectThrows(
IllegalArgumentException.class,
() -> DiscoveryNodeRole.REPLICA_ONLY_ROLE.validateRole(
Arrays.asList(DiscoveryNodeRole.REPLICA_ONLY_ROLE, DiscoveryNodeRole.CLUSTER_MANAGER_ROLE)
)
);
assertThat(e2, hasToString(containsString("replica_only role must be the only role")));

final IllegalArgumentException e3 = expectThrows(
IllegalArgumentException.class,
() -> DiscoveryNodeRole.REPLICA_ONLY_ROLE.validateRole(Arrays.asList(DiscoveryNodeRole.REPLICA_ONLY_ROLE, DiscoveryNodeRole.INGEST_ROLE))
);
assertThat(e3, hasToString(containsString("replica_only role must be the only role")));

// replica_only role by itself should not throw
DiscoveryNodeRole.REPLICA_ONLY_ROLE.validateRole(Arrays.asList(DiscoveryNodeRole.REPLICA_ONLY_ROLE));
}

public void testReplicaOnlyRoleProperties() {
assertEquals("replica_only", DiscoveryNodeRole.REPLICA_ONLY_ROLE.roleName());
assertEquals("ro", DiscoveryNodeRole.REPLICA_ONLY_ROLE.roleNameAbbreviation());
assertTrue(DiscoveryNodeRole.REPLICA_ONLY_ROLE.canContainData());
assertNull(DiscoveryNodeRole.REPLICA_ONLY_ROLE.legacySetting());
}

public void testReplicaOnlyRoleInBuiltInRoles() {
assertTrue(DiscoveryNodeRole.BUILT_IN_ROLES.contains(DiscoveryNodeRole.REPLICA_ONLY_ROLE));
}
}
Loading
Loading