From 6f5e1e9b0c9a1246c44ff83207b100249fcbba97 Mon Sep 17 00:00:00 2001 From: Basha Mougamadou Date: Fri, 18 Oct 2024 14:23:19 +0200 Subject: [PATCH] Add retry on consul lock if failures Since the consul lock is based on the consul infrastructure, we can hit issues when consul is restarting while locking choregraphie on consul. Indeed, we had cases where consul on the leader node took time to restart, causing Diplomat call failures. --- libraries/primitive_consul_lock.rb | 33 ++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/libraries/primitive_consul_lock.rb b/libraries/primitive_consul_lock.rb index 530d664..8e74199 100644 --- a/libraries/primitive_consul_lock.rb +++ b/libraries/primitive_consul_lock.rb @@ -122,6 +122,11 @@ def self.get_or_create(path, concurrency:, **kwargs) current_lock = begin Chef::Log.info "Fetch lock state for #{path}" Diplomat::Kv.get(path, decode_values: true, dc: dc, token: dc) + rescue Faraday::ConnectionFailed => e + retry_secs = 30 + Chef::Log.info "Consul did not respond, wait #{retry_secs} seconds and retry to let it (re)start: #{e}" + sleep retry_secs + (retry_left -= 1).positive? ? 
retry : raise rescue Diplomat::KeyNotFound Chef::Log.info "Lock for #{path} did not exist, creating with value #{value}" Diplomat::Kv.put(path, value.to_json, cas: 0, dc: dc, token: token) # we ignore success/failure of CaS @@ -170,9 +175,17 @@ def enter(opts) if can_enter_lock?(opts) enter_lock(opts) require 'diplomat' - result = Diplomat::Kv.put(@path, to_json, cas: @cas, dc: @dc, token: @token) - Chef::Log.debug('Someone updated the lock at the same time, will retry') unless result - result + retry_left = 5 + begin + result = Diplomat::Kv.put(@path, to_json, cas: @cas, dc: @dc, token: @token) + Chef::Log.debug('Someone updated the lock at the same time, will retry') unless result + result + rescue Faraday::ConnectionFailed => e + retry_secs = 30 + Chef::Log.info "Consul did not respond, wait #{retry_secs} seconds and retry to let it (re)start: #{e}" + sleep retry_secs + (retry_left -= 1).positive? ? retry : raise + end else Chef::Log.debug("Too many lock holders (concurrency:#{concurrency})") false @@ -188,9 +201,17 @@ def exit(opts) if already_entered?(opts) exit_lock(opts) require 'diplomat' - result = Diplomat::Kv.put(@path, to_json, cas: @cas, dc: @dc, token: @token) - Chef::Log.debug('Someone updated the lock at the same time, will retry') unless result - result + retry_left = 5 + begin + result = Diplomat::Kv.put(@path, to_json, cas: @cas, dc: @dc, token: @token) + Chef::Log.debug('Someone updated the lock at the same time, will retry') unless result + result + rescue Faraday::ConnectionFailed => e + retry_secs = 30 + Chef::Log.info "Consul did not respond, wait #{retry_secs} seconds and retry to let it (re)start: #{e}" + sleep retry_secs + (retry_left -= 1).positive? ? retry : raise + end else true end