Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@ Changed

- Update ``http`` dependency to `1.8.0 <https://github.com/tarantool/http/releases/tag/1.8.0>`_.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Fixed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- An **instance** (router or storage) could stick to a minority etcd node after a
network partition, keep an outdated ``active_leaders`` key, and cause split-brain.
Ordinary reads are now sent with ``quorum=true``, and every request (reads, writes,
long-polls) is issued to the next endpoint in round-robin order.
This prevents split-brain and lets long-polls eventually reach a majority node.

-------------------------------------------------------------------------------
[2.16.1] - 2025-07-04
-------------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion cartridge/etcd2-client.lua
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ end
-- @treturn[2] table Error description
local function check_quorum(client)
local session = client:get_session()
local resp, err = session.connection:request('GET', '/lock?quorum=true')
local resp, err = session.connection:request('GET', '/lock', {quorum=true})
if resp ~= nil then
return true
elseif err.etcd_code == etcd2.EcodeKeyNotFound then
Expand Down
30 changes: 24 additions & 6 deletions cartridge/etcd2.lua
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ local function request(connection, method, path, args, opts)
assert(connection.etcd_cluster_id ~= nil)

local body = {}
if method == 'GET' then
args = args or {}
-- Quorum does not work with wait: it does not wait for the result and returns immediately.
-- If quorum is not provided explicitly, it defaults to true.
if args.wait == nil and args.quorum == nil then
args.quorum = true
end
end

if args ~= nil then
for k, v in pairs(args) do
table.insert(body, k .. '=' .. tostring(v))
Expand Down Expand Up @@ -57,12 +66,11 @@ local function request(connection, method, path, args, opts)
local lasterror
local num_endpoints = #connection.endpoints
assert(num_endpoints > 0)
for _ = 1, num_endpoints do
local eidx = connection.eidx

for i = 0, num_endpoints - 1 do
local eidx = connection.eidx + i
if eidx > num_endpoints then
eidx = eidx % num_endpoints
end
-- round-robin on etcd-connections
connection.eidx = (connection.eidx % num_endpoints) + 1

if #connection.endpoints ~= num_endpoints then
-- something may change during network yield
Expand Down Expand Up @@ -102,7 +110,6 @@ local function request(connection, method, path, args, opts)
goto continue
end

connection.eidx = eidx
local ok, data = pcall(json.decode, resp.body)
if not ok then
-- Example:
Expand Down Expand Up @@ -134,6 +141,17 @@ local function request(connection, method, path, args, opts)
-- x-etcd-cluster-id: cdf818194e3a8c32
-- x-etcd-index: '61529'
-- ...
if (data.errorCode == 300 or data.errorCode == 301) and method == 'GET' then
lasterror = EtcdError:new(
"quorum not ok for %s, %s, %s, %s",
connection.endpoints[eidx],
data.errorCode,
data.message,
data.cause
)
lasterror.http_code = resp.status
goto continue
end

local err = EtcdError:new('%s (%s): %s',
data.message, data.errorCode, data.cause
Expand Down
70 changes: 70 additions & 0 deletions test/integration/etcd2_client_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -578,3 +578,73 @@ function g.test_promote_after_close()

rawset(package.loaded['http.client'], 'request', old_request)
end

--- GET requests issued after one etcd node has been stopped.
-- Exercises three forms of the `quorum` argument: explicitly false,
-- explicitly true, and omitted. The last two are expected to fail the
-- same way (no etcd code, HTTP status 408).
function g.test_get_request_with_implicit_quorum()
    local key = '/some_key'
    local client = create_client()
    local session = client:get_session()
    -- NOTE(review): other tests read `eidx` from `session.connection`;
    -- confirm that assigning it on the session itself has any effect.
    session.eidx = 1
    g.etcd_b.process:kill('STOP') -- etcd lack of quorum

    -- quorum=false: the request fails with etcd code 100
    -- (presumably "key not found" -- confirm against etcd2 constants).
    local res, e = session.connection:request('GET', key, {quorum=false})
    t.assert_equals(res, nil)
    t.assert_equals(e.etcd_code, 100)

    -- Re-opens the session, repeats the GET with the given args and
    -- checks that it fails with HTTP 408 and no etcd error code.
    local function expect_http_408(args)
        client:drop_session()
        session = client:get_session()

        local body, failure = session.connection:request('GET', key, args)
        t.assert_equals(body, nil)
        t.assert_equals(failure.etcd_code, nil)
        t.assert_equals(failure.http_code, 408)
    end

    -- explicit quorum
    expect_http_408({quorum=true})
    -- quorum omitted
    expect_http_408(nil)
end

--- A single request must change the connection's endpoint index.
function g.test_round_robin_on_etcd_nodes()
    local session = create_client():get_session()

    -- Remember which endpoint index the connection points at beforehand.
    local idx_before = session.connection.eidx

    session.connection:request('GET', '/some_key', {quorum=false})

    -- The index is re-read after the request and must differ.
    t.assert_not_equals(idx_before, session.connection.eidx)
end

--- Long-polling keeps delivering leader updates after an etcd node is stopped.
-- One session acquires the lock and publishes leaders; a second client
-- long-polls and must observe every update, including the one published
-- right before `etcd_b` receives STOP.
function g.test_longpolling_lack_of_quorum()
    local c1 = create_client():get_session()
    local kid = uuid.str()
    t.assert_equals(
        c1:acquire_lock({uuid = kid, uri = 'localhost:9'}),
        true
    )
    c1:set_leaders({{'A', 'a1'}, {'B', 'b1'}})

    local client = create_client()

    -- Starts client:longpoll(0.2) in a new fiber and returns a channel
    -- that receives {ret, err} when the call completes.
    local function async_longpoll()
        local result_ch = fiber.channel(1)
        fiber.new(function()
            local ret, err = client:longpoll(0.2)
            result_ch:put({ret, err})
        end)
        return result_ch
    end

    -- The first poll returns the leaders published above.
    t.assert_equals(client:longpoll(0.2), {A = 'a1', B = 'b1'})

    local chan = async_longpoll()
    -- t.assert takes (value, message); the stray `true` that used to be
    -- passed as the message is dropped so a failure reports a real message.
    t.assert(c1:set_leaders({{'A', 'a2'}}))

    t.assert_equals(chan:get(0.2), {{A = 'a2', B = 'b1'}})

    t.assert(c1:set_leaders({{'B', 'b2'}}))

    g.etcd_b.process:kill('STOP') -- etcd lack of quorum

    -- Reuse the existing local instead of redeclaring it in the same scope.
    chan = async_longpoll()
    t.assert_equals(chan:get(0.4), {{A = 'a2', B = 'b2'}})
end
Loading