diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3ca7f1579..b34d6440a 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -18,6 +18,11 @@ Added
 - New ``config_applied`` variable in ``cartridge.twophase`` module to track clusterwide configuration status.
 
+- Improved failover and leader election logging:
+
+  - Added structured logs explaining why a leader appointment was made or skipped.
+  - Logs now include replicaset aliases and the number of candidates evaluated.
+  - The control loop now logs its start and wait states more clearly.
 
 -------------------------------------------------------------------------------
 [2.16.0] - 2025-06-20
diff --git a/cartridge/failover.lua b/cartridge/failover.lua
index cdf137e77..e7d7145b6 100644
--- a/cartridge/failover.lua
+++ b/cartridge/failover.lua
@@ -214,6 +214,16 @@ local function _get_appointments_stateful_mode(client, timeout)
     return client:longpoll(timeout)
 end
 
+local function _get_replicaset_alias_by_replicaset_uuid(replicaset_uuid)
+    local topology_cfg = vars.clusterwide_config:get_readonly('topology')
+    local replicaset_map = assert(topology_cfg.replicasets)
+    local replicaset = replicaset_map[replicaset_uuid]
+
+    if replicaset ~= nil and replicaset.alias ~= nil then
+        return replicaset.alias
+    end
+end
+
 local function describe(uuid)
     local topology_cfg = vars.clusterwide_config:get_readonly('topology')
     local servers = assert(topology_cfg.servers)
@@ -266,8 +276,10 @@
 
         changed = true
 
-        log.info('Replicaset %s%s: new leader %s, was %s',
+        local replicaset_alias = _get_replicaset_alias_by_replicaset_uuid(replicaset_uuid)
+        log.info('Replicaset %s%s%s: new leader %s, previous leader %s',
             replicaset_uuid,
+            replicaset_alias ~= nil and ' (' .. replicaset_alias .. ')' or '',
             replicaset_uuid == vars.replicaset_uuid and ' (me)' or '',
             describe(leader_uuid),
             describe(current_leader)
@@ -357,6 +369,8 @@ local function fencing_healthcheck()
     -- there is no need to actuate fencing yet
     if assert(vars.client):check_quorum() then
         return true
+    else
+        log.warn('Quorum check failed, checking replicas...')
     end
 
     local topology_cfg = vars.clusterwide_config:get_readonly('topology')
@@ -444,6 +458,8 @@ local function synchro_promote()
     and not vars.failover_suppressed
     and box.ctl.promote ~= nil then
+        log.info('Attempting box.ctl.promote()')
+
         local ok, err = pcall(box.ctl.promote)
         if ok ~= true then
             log.error('Failed to promote: %s', err or 'unknown')
@@ -464,6 +480,8 @@ local function synchro_demote()
     and box_info.synchro.queue.owner ~= 0
     and box_info.synchro.queue.owner == box_info.id
     and box.ctl.demote ~= nil then
+        log.info('Attempting box.ctl.demote()')
+
         local ok, err = pcall(box.ctl.demote)
         if ok ~= true then
             log.error('Failed to demote: %s', err or 'unknown')
@@ -636,6 +654,7 @@ function reconfigure_all(active_leaders)
         fencing_start()
     end
 
+    local apply_total_start = fiber.clock()
     local ok, err = FailoverError:pcall(function()
         vars.failover_trigger_cnt = vars.failover_trigger_cnt + 1
         box.cfg({
@@ -678,10 +697,11 @@
         return true
     end)
 
+    local apply_total_elapsed = fiber.clock() - apply_total_start
     if ok then
-        log.info('Failover step finished')
+        log.info('Failover step finished in %.6f sec', apply_total_elapsed)
     else
-        log.warn('Failover step failed: %s', err)
+        log.warn('Failover step failed after %.6f sec: %s', apply_total_elapsed, err)
     end
     confapplier.set_state('RolesConfigured')
 end
@@ -729,7 +749,7 @@ local function failover_loop(args)
         local csw1 = utils.fiber_csw()
 
         if appointments == nil then
-            log.warn('%s', err.err)
+            log.warn('Appointments error: %s', err.err)
             vars.failover_err = FailoverError:new(
                 "Error fetching appointments: %s", err.err
             )
diff --git a/cartridge/roles/coordinator.lua b/cartridge/roles/coordinator.lua
index 0ad3faef6..83b7d8155 100644
--- a/cartridge/roles/coordinator.lua
+++ b/cartridge/roles/coordinator.lua
@@ -44,6 +44,17 @@ vars:new('healthcheck', function(_, instance_uuid)
     return false
 end)
 
+local DECISION_REASONS = {
+    IMMUNITY_NOT_EXPIRED = 'immunity_not_expired',
+    CURRENT_LEADER_HEALTHY = 'current_leader_healthy',
+    FIRST_APPOINTMENT = 'first_appointment',
+    NEW_LEADER_SELECTED = 'new_leader_selected',
+    NO_HEALTHY_CANDIDATES = 'no_healthy_candidates',
+}
+
+local CHECKED_NONE = 0
+local CHECKED_FIRST = 1
+
 local function pack_decision(leader_uuid)
     checks('string')
     return {
@@ -53,6 +64,60 @@ local function pack_decision(leader_uuid)
     }
 end
 
+local function _get_replicaset_alias_by_replicaset_uuid(replicaset_uuid)
+    local replicaset_map = assert(vars.topology_cfg.replicasets)
+    local replicaset = replicaset_map[replicaset_uuid]
+
+    if replicaset ~= nil and replicaset.alias ~= nil then
+        return replicaset.alias
+    end
+end
+
+local function describe(uuid)
+    local servers = assert(vars.topology_cfg.servers)
+    local srv = servers[uuid]
+
+    if srv ~= nil then
+        local uri = srv.uri
+        local alias
+
+        local member = membership.get_member(uri)
+        if member ~= nil and member.payload ~= nil and member.payload.alias ~= nil then
+            alias = member.payload.alias
+        end
+
+        if alias ~= nil and uri ~= nil then
+            return string.format('%s (%s, %q)', uuid, alias, uri)
+        elseif uri ~= nil then
+            return string.format('%s (%q)', uuid, uri)
+        end
+    end
+
+    return uuid
+end
+
+--- Make leader election decision for a replicaset.
+--
+-- This function evaluates the current leader and available candidates,
+-- taking into account immunity timeout and health status.
+--
+-- @function make_decision
+-- @local
+-- @tparam table ctx The failover context table (must include .members and .decisions)
+-- @tparam string replicaset_uuid UUID of the replicaset
+--
+-- @treturn[1] table decision The decision table with `leader` and `immunity` fields
+-- @treturn[1] table info Metadata about the decision:
+--   - info.reason (string): One of:
+--     * DECISION_REASONS.IMMUNITY_NOT_EXPIRED
+--     * DECISION_REASONS.CURRENT_LEADER_HEALTHY
+--     * DECISION_REASONS.FIRST_APPOINTMENT
+--     * DECISION_REASONS.NEW_LEADER_SELECTED
+--     * DECISION_REASONS.NO_HEALTHY_CANDIDATES
+--   - info.checked (number): Number of candidates actually checked
+--
+-- @treturn[2] nil
+-- @treturn[2] table info Same as above
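+--
+-- @usage
+-- -- Sketch of the calling pattern (mirrors `control_loop` below;
+-- -- `ctx` and `replicaset_uuid` are assumed to be prepared by the caller):
+-- local decision, info = make_decision(ctx, replicaset_uuid)
+-- if decision ~= nil then
+--     log.info('appoint %s (reason=%s, checked=%d)',
+--         decision.leader, info.reason, info.checked)
+-- elseif info.reason ~= DECISION_REASONS.CURRENT_LEADER_HEALTHY then
+--     log.warn('no appointment (reason=%s, checked=%d)',
+--         info.reason, info.checked)
+-- end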
 local function make_decision(ctx, replicaset_uuid)
     checks({members = 'table', decisions = 'table'}, 'string')
 
@@ -60,11 +125,17 @@
     if current_decision ~= nil then
         local current_leader_uuid = current_decision.leader
         local current_leader = vars.topology_cfg.servers[current_leader_uuid]
-        if fiber.clock() < current_decision.immunity
-        or (topology.electable(current_leader_uuid, current_leader)
-        and vars.healthcheck(ctx.members, current_decision.leader))
-        then
-            return nil
+        if topology.electable(current_leader_uuid, current_leader)
+        and vars.healthcheck(ctx.members, current_leader_uuid) then
+            return nil, {
+                reason = DECISION_REASONS.CURRENT_LEADER_HEALTHY,
+                checked = CHECKED_NONE,
+            }
+        elseif fiber.clock() < current_decision.immunity then
+            return nil, {
+                reason = DECISION_REASONS.IMMUNITY_NOT_EXPIRED,
+                checked = CHECKED_NONE,
+            }
         end
     end
@@ -78,16 +149,29 @@
         -- without regard to the healthcheck
         local decision = pack_decision(candidates[1])
         ctx.decisions[replicaset_uuid] = decision
-        return decision
+        return decision, {
+            reason = DECISION_REASONS.FIRST_APPOINTMENT,
+            checked = CHECKED_FIRST,
+        }
     end
 
+    local checked_count = 0
     for _, instance_uuid in ipairs(candidates) do
+        checked_count = checked_count + 1
         if vars.healthcheck(ctx.members, instance_uuid) then
             local decision = pack_decision(instance_uuid)
             ctx.decisions[replicaset_uuid] = decision
-            return decision
+            return decision, {
+                reason = DECISION_REASONS.NEW_LEADER_SELECTED,
+                checked = checked_count,
+            }
         end
     end
+
+    return nil, {
+        reason = DECISION_REASONS.NO_HEALTHY_CANDIDATES,
+        checked = checked_count,
+    }
 end
 
 local function control_loop(session)
@@ -95,18 +179,38 @@
     local ctx = assert(session.ctx)
 
     while true do
+        log.info('Making decisions')
+
         ctx.members = membership.members()
 
         local updates = {}
 
-        for replicaset_uuid, _ in pairs(vars.topology_cfg.replicasets) do
-            local decision = make_decision(ctx, replicaset_uuid)
+        for replicaset_uuid, replicaset in pairs(vars.topology_cfg.replicasets) do
+            local prev = ctx.decisions[replicaset_uuid]
+            local decision, info = make_decision(ctx, replicaset_uuid)
+            local prev_leader_uuid = prev ~= nil and prev.leader ~= nil and prev.leader or 'none'
+
             if decision ~= nil then
                 table.insert(updates, {replicaset_uuid, decision.leader})
-                log.info('Replicaset %s: appoint %s (%q)',
-                    replicaset_uuid, decision.leader,
-                    vars.topology_cfg.servers[decision.leader].uri
+                log.info(
+                    'Replicaset %s%s: appoint new leader %s, previous leader %s (reason=%s, checked=%d)',
+                    replicaset_uuid,
+                    replicaset.alias ~= nil and ' (' .. replicaset.alias .. ')' or '',
+                    describe(decision.leader),
+                    describe(prev_leader_uuid),
+                    info.reason,
+                    info.checked
                 )
+            else
+                if info.reason ~= DECISION_REASONS.CURRENT_LEADER_HEALTHY then
+                    log.warn(
+                        'Replicaset %s%s: no appointment made (reason=%s, checked=%d)',
+                        replicaset_uuid,
+                        replicaset.alias ~= nil and ' (' .. replicaset.alias .. ')' or '',
+                        info.reason,
+                        info.checked
+                    )
+                end
             end
         end
 
@@ -136,6 +240,7 @@ local function control_loop(session)
         end
 
         assert(next_moment >= now)
+        log.info('Waiting for membership notifications')
         vars.membership_notification:wait(next_moment - now)
         fiber.testcancel()
     end
@@ -376,12 +481,20 @@ local function appoint_leaders(leaders)
             return nil, AppointmentError:new("Cannot appoint non-electable instance")
        end
 
+        local prev = session.ctx.decisions[replicaset_uuid]
+        local prev_leader_uuid = prev ~= nil and prev.leader ~= nil and prev.leader or 'none'
+
+        local replicaset_alias = _get_replicaset_alias_by_replicaset_uuid(replicaset_uuid)
+        log.info(
+            'Replicaset %s%s: appoint new leader %s, previous leader %s (manual)',
+            replicaset_uuid,
+            replicaset_alias ~= nil and ' (' .. replicaset_alias .. ')' or '',
+            describe(leader_uuid),
+            describe(prev_leader_uuid)
+        )
+
         local decision = pack_decision(leader_uuid)
         table.insert(updates, {replicaset_uuid, decision.leader})
-        log.info('Replicaset %s: appoint %s (%q) (manual)',
-            replicaset_uuid, decision.leader,
-            assert(servers[decision.leader]).uri
-        )
         session.ctx.decisions[replicaset_uuid] = decision
     end
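
For reviewers, a standalone Lua sketch of what the new coordinator appointment message looks like once formatted. All UUIDs, aliases, and URIs below are invented; in the patch the same arguments are passed to log.info rather than string.format, and the alias suffix and describe() strings are built by the helpers added above.

    -- Illustrative only: every identifier and address below is made up.
    local msg = string.format(
        'Replicaset %s%s: appoint new leader %s, previous leader %s (reason=%s, checked=%d)',
        'aaaaaaaa-0000-4000-a000-000000000000',        -- replicaset_uuid
        ' (storage-hot)',                              -- optional alias suffix
        'aaaaaaaa-aaaa-4000-a000-000000000002 (storage-2, "localhost:3302")',  -- describe(decision.leader)
        'aaaaaaaa-aaaa-4000-a000-000000000001 (storage-1, "localhost:3301")',  -- describe(prev_leader_uuid)
        'new_leader_selected',                         -- info.reason
        2                                              -- info.checked
    )
    print(msg)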