diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 5ee2f3167..812274429 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -12,6 +12,13 @@ and this project adheres to
 Unreleased
 -------------------------------------------------------------------------------
 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Fixed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Invalid leader appointment in Raft failover when a replicaset has fewer than
+  three instances (too few to enable Raft failover).
+
 -------------------------------------------------------------------------------
 [2.15.3] - 2025-04-24
 -------------------------------------------------------------------------------
diff --git a/cartridge/failover.lua b/cartridge/failover.lua
index 59c34be9e..eb020ecf1 100644
--- a/cartridge/failover.lua
+++ b/cartridge/failover.lua
@@ -958,7 +958,6 @@ local function cfg(clusterwide_config, opts)
         and #topology.get_leaders_order(
             topology_cfg, vars.replicaset_uuid, nil,
             {only_electable = false, only_enabled = true}) < 3 then
-            first_appointments = _get_appointments_disabled_mode(topology_cfg)
             log.warn('Not enough instances to enable Raft failover')
             raft_failover.disable()
         else
@@ -966,10 +965,10 @@ local function cfg(clusterwide_config, opts)
             if not ok then
                 return nil, err
             end
-            first_appointments = raft_failover.get_appointments(topology_cfg)
             log.info('Raft failover enabled')
         end
 
+        first_appointments = raft_failover.get_appointments(topology_cfg)
         vars.failover_fiber = fiber.new(failover_loop, {
            get_appointments = function()
                vars.membership_notification:wait()
diff --git a/cartridge/failover/raft.lua b/cartridge/failover/raft.lua
index a1b4570e4..0c42a0cd2 100644
--- a/cartridge/failover/raft.lua
+++ b/cartridge/failover/raft.lua
@@ -58,7 +58,22 @@ local function get_appointments(topology_cfg)
         ::next_rs::
     end
 
-    appointments[vars.replicaset_uuid] = vars.leader_uuid
+    -- If Raft is enabled but `disable_raft_on_small_clusters` is active
+    -- and a replicaset has fewer than 3 instances,
+    -- Raft failover is considered disabled for that replicaset.
+    -- In this case, appoint its leader as in 'disabled' failover mode.
+    for replicaset_uuid, _ in pairs(replicasets) do
+        local leaders = topology.get_leaders_order(
+            topology_cfg, replicaset_uuid, nil, {only_electable = false, only_enabled = true}
+        )
+        if vars.disable_raft_on_small_clusters and #leaders < 3 then
+            appointments[replicaset_uuid] = leaders[1]
+        end
+    end
+
+    if vars.leader_uuid ~= nil then
+        appointments[vars.replicaset_uuid] = vars.leader_uuid
+    end
 
     return appointments
 end
diff --git a/test/entrypoint/srv_basic.lua b/test/entrypoint/srv_basic.lua
index f973c6627..af323c9af 100755
--- a/test/entrypoint/srv_basic.lua
+++ b/test/entrypoint/srv_basic.lua
@@ -32,6 +32,7 @@ package.preload['mymodule'] = function()
     local httpd = service_registry.get('httpd')
     local validated = {}
     local master_switches = {}
+    local leaders_history = {}
 
     if httpd ~= nil then
         httpd:route(
@@ -72,6 +73,7 @@ package.preload['mymodule'] = function()
         get_state = function() return state end,
         is_master = function() return master end,
         get_master_switches = function() return master_switches end,
+        get_leaders_history = function() return leaders_history end,
         validate_config = function()
             table.insert(validated, true)
             return true
@@ -99,6 +101,8 @@ package.preload['mymodule'] = function()
             end
             master = opts.is_master
             table.remove(validated, #validated)
+            local failover = require('cartridge.failover')
+            table.insert(leaders_history, failover.get_active_leaders())
         end,
         stop = function()
             state = 'stopped'
diff --git a/test/integration/raft_test.lua b/test/integration/raft_test.lua
index 5fd646006..2dcd1e275 100644
--- a/test/integration/raft_test.lua
+++ b/test/integration/raft_test.lua
@@ -8,6 +8,7 @@ local g_not_enough_instances = t.group('integration.raft_not_enough_instances')
 local g_unelectable = t.group('integration.raft_unelectable')
 local g_disable = t.group('integration.raft_disabled_instances')
 local g_expel = t.group('integration.raft_expelled_instances')
+local g_appointments = t.group('integration.raft_appointments')
 local g_all_rw = t.group('integration.raft_all_rw')
 
 local h = require('test.helper')
@@ -341,6 +342,7 @@ g.test_kill_master = function()
 
     start_server('storage-1')
     start_server('storage-2')
+    g.cluster:wait_until_healthy()
 
     local after_2pc = get_2pc_count()
 
@@ -794,3 +796,133 @@ g_all_rw.test_raft_in_all_rw_mode_fails = function()
         return require('cartridge.confapplier').get_state()
     end), 'RolesConfigured')
 end
+
+----------------------------------------------------------------
+
+setup_group(g_appointments, {
+    {
+        alias = 'router',
+        uuid = h.uuid('a'),
+        roles = {
+            'vshard-router',
+            'myrole',
+        },
+        servers = 1,
+    },
+    {
+        alias = 'storage',
+        uuid = replicaset_uuid,
+        roles = {
+            'vshard-storage',
+        },
+        servers = {
+            {
+                instance_uuid = storage_1_uuid,
+            },
+            {
+                instance_uuid = storage_2_uuid,
+            },
+            {
+                instance_uuid = storage_3_uuid,
+            },
+        }
+    },
+})
+
+g_appointments.before_each(function()
+    t.assert_equals(set_failover_params(g_appointments, { mode = 'raft' }), { mode = 'raft' })
+end)
+
+g_appointments.after_each(function()
+    -- Restore the original leader
+    g_appointments.cluster:retrying({}, function()
+        g_appointments.cluster:server('storage-1'):call('box.ctl.promote')
+    end)
+
+    h.retrying({}, function()
+        t.assert_equals(g_appointments.cluster:server('storage-1'):eval(q_leadership), storage_1_uuid)
+    end)
+end)
+
+g_appointments.test_leader_persists_after_config_apply = function()
+    -- There was a bug with leader selection when Raft failover was enabled
+    -- on replicasets with fewer than 3 instances.
+    -- Raft is disabled on such replicasets; typically these are routers.
+    -- As a result, the router considered the wrong storages to be masters,
+    -- and vshard returned a NON_MASTER error.
+    -- The bug is reproduced as follows:
+    -- 1. Change the leader in the replicaset.
+    -- 2. After the leader change, the router learns about the new leader via membership.
+    -- 3. Apply a config update, which triggers failover.cfg.
+    -- 4. The router incorrectly determines the leader and sets the old master as the leader,
+    --    because it appears first in the topology (lexicographically), as in 'disabled' mode.
+
+    -- Step 1: change the leader in the replicaset
+    g_appointments.cluster:retrying({}, function()
+        g_appointments.cluster:server('storage-2'):call('box.ctl.promote')
+    end)
+
+    local storage_rs_uuid = g_appointments.cluster:server('storage-1').replicaset_uuid
+
+    local leader_switched_index
+    -- Step 2: wait for the router to update the leader via membership after the master change
+    h.retrying({}, function()
+        local res = g_appointments.cluster:server('router-1'):exec(function()
+            local cartridge = require('cartridge')
+            return cartridge.service_get('myrole').get_leaders_history()
+        end)
+
+        leader_switched_index = #res
+        t.assert_equals(res[leader_switched_index][storage_rs_uuid], storage_2_uuid)
+    end)
+
+    -- Step 3: apply a new config to simulate a configuration change and trigger apply_config
+    g_appointments.cluster:server('router-1'):exec(function()
+        return require('cartridge').config_patch_clusterwide({uuid = require('uuid').str()})
+    end)
+
+    h.wish_state(g_appointments.cluster:server('router-1'), 'RolesConfigured', 10)
+
+    -- Step 4: check the updated leaders on the router.
+    -- Ensure the leader did not revert from storage-2 back to storage-1.
+    h.retrying({}, function()
+        local res = g_appointments.cluster:server('router-1'):exec(function()
+            local cartridge = require('cartridge')
+            return cartridge.service_get('myrole').get_leaders_history()
+        end)
+
+        t.assert(#res > leader_switched_index, 'Wait for failover.cfg to be called again after config apply')
+        for i = leader_switched_index, #res do
+            local leader_list = res[i]
+            -- After the switch, storage-2 must remain the leader
+            t.assert_equals(leader_list[storage_rs_uuid], storage_2_uuid)
+        end
+    end)
+end
+
+g_appointments.test_on_small_replicaset = function()
+    -- Check that if there are fewer than 3 instances in the replicaset,
+    -- the router behaves as in 'disabled' failover mode and appoints
+    -- the first (lexicographically) enabled instance as leader.
+
+    -- Change the master
+    g_appointments.cluster:retrying({}, function()
+        g_appointments.cluster:server('storage-2'):call('box.ctl.promote')
+    end)
+
+    h.retrying({}, function()
+        t.assert_equals(g_appointments.cluster:server('router-1'):eval(q_leadership), storage_2_uuid)
+    end)
+
+    -- Disable one instance to simulate a small replicaset (<3 instances)
+    g_appointments.cluster.main_server:exec(function(uuid)
+        require('cartridge.lua-api.topology').disable_servers({uuid})
+    end, {storage_2_uuid})
+
+    h.wish_state(g_appointments.cluster:server('router-1'), 'RolesConfigured', 10)
+
+    h.retrying({}, function()
+        -- storage-1 should be appointed as leader on router, as in 'disabled' mode
+        t.assert_equals(g_appointments.cluster:server('router-1'):eval(q_leadership), storage_1_uuid)
+    end)
+end
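
For illustration, a minimal standalone Lua sketch of the appointment rule encoded by the cartridge/failover/raft.lua hunk above. The helper name apply_small_replicaset_fallback and its flat arguments are hypothetical (the real code reads topology_cfg, vars, and topology.get_leaders_order()); the point is only that leaders learned via Raft/membership are kept for replicasets with three or more instances, while smaller replicasets fall back to the 'disabled'-mode choice of the first enabled instance.

-- Hypothetical, simplified stand-in for the tail of get_appointments():
-- `appointments` already holds leaders learned from the Raft/membership scan
-- (not reproduced here); `leaders_by_rs` mimics topology.get_leaders_order()
-- called with {only_electable = false, only_enabled = true}.
local function apply_small_replicaset_fallback(appointments, leaders_by_rs,
                                               local_rs_uuid, raft_leader_uuid,
                                               disable_on_small)
    for rs_uuid, leaders in pairs(leaders_by_rs) do
        -- Replicasets with fewer than 3 enabled instances behave as in
        -- 'disabled' failover mode: the first enabled instance leads.
        if disable_on_small and #leaders < 3 then
            appointments[rs_uuid] = leaders[1]
        end
    end
    -- The locally known Raft leader still wins for the local replicaset.
    if raft_leader_uuid ~= nil then
        appointments[local_rs_uuid] = raft_leader_uuid
    end
    return appointments
end

-- Router-side example: the 3-instance storage replicaset keeps the leader it
-- learned before the fallback, while the 1-instance router replicaset gets its
-- only (lexicographically first) enabled instance instead of no appointment.
local appointments = apply_small_replicaset_fallback(
    {['storage-rs'] = 'storage-2'},
    {
        ['storage-rs'] = {'storage-1', 'storage-2', 'storage-3'},
        ['router-rs'] = {'router-1'},
    },
    'router-rs', nil, true
)
assert(appointments['storage-rs'] == 'storage-2')
assert(appointments['router-rs'] == 'router-1')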