Skip to content

Commit b2947e7

Browse files
authored
Merge pull request #2082 from fonta-rh/OCPBUGS-59238-fix-active-resource-count
OCPBUGS-59238: podman-etcd: Redo counting of active_resources to avoid bug on rapid etcd restart
2 parents 45491e8 + 0114ddf commit b2947e7

File tree

1 file changed

+46
-2
lines changed

1 file changed

+46
-2
lines changed

heartbeat/podman-etcd

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,50 @@ get_peer_node_name() {
11201120
crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
11211121
}
11221122

1123+
# Print the count of truly active resources, i.e. the resources listed in
# $OCF_RESKEY_CRM_meta_notify_active_resource that are NOT also listed in
# $OCF_RESKEY_CRM_meta_notify_stop_resource.
#
# According to Pacemaker documentation, during "Post-notification (stop) /
# Pre-notification (start)" transitions, the true active resource count is:
#   Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
#                      minus $OCF_RESKEY_CRM_meta_notify_stop_resource
# This handles the case where a resource appears in both the active and stop
# lists during rapid restart scenarios (e.g., process crash recovery).
#
# Globals:   OCF_RESKEY_CRM_meta_notify_active_resource (read)
#            OCF_RESKEY_CRM_meta_notify_stop_resource   (read)
# Arguments: none
# Outputs:   the count on stdout as a bare decimal integer ("0" when the
#            active list is empty or contains only whitespace)
get_truly_active_resources_count() {
	local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource"
	local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource"
	# Loop variables are declared local so that calling this function in the
	# current shell never clobbers same-named variables of the caller.
	local resource stop_resource is_stopping
	local count=0

	# Both lists are whitespace-separated, so they are expanded unquoted on
	# purpose. An empty or whitespace-only list yields zero iterations, which
	# covers the "no active resources" and "nothing being stopped" cases
	# without any special handling.
	for resource in $active_list; do
		is_stopping=0
		for stop_resource in $stop_list; do
			if [ "$resource" = "$stop_resource" ]; then
				is_stopping=1
				break
			fi
		done
		# Only resources that are not scheduled to stop are counted.
		if [ "$is_stopping" -eq 0 ]; then
			count=$((count + 1))
		fi
	done

	# Counting in-shell (instead of piping to wc -w) guarantees the output has
	# no padding, so callers can match it directly in a case statement.
	echo "$count"
}
1166+
11231167
get_all_etcd_endpoints() {
11241168
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
11251169
name=$(echo "$node" | cut -d: -f1)
@@ -1739,8 +1783,8 @@ podman_start()
17391783
# - 0 active agents, 1 starting: we are starting; the peer is not starting
17401784
# - 0 active agents, 2 starting: both agents are starting simultaneously
17411785
local active_resources_count
1742-
active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
1743-
ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
1786+
active_resources_count=$(get_truly_active_resources_count)
1787+
ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
17441788
case "$active_resources_count" in
17451789
1)
17461790
if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then

0 commit comments

Comments
 (0)