Skip to content

Commit c5e864b

Browse files
demartinofra authored and mauri-melato committed
nodewatcher: fix terminate if down
Faulty nodes need to be replaced even if the min size has been reached.

Signed-off-by: Francesco De Martino <[email protected]>
1 parent a636397 commit c5e864b

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

src/nodewatcher/nodewatcher.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -201,7 +201,7 @@ def _dump_logs(instance_id):
201201
log.warning("Failed while dumping logs to %s with exception %s.", filename, e)
202202

203203

204-
def _terminate_if_down(scheduler_module, config, instance_id, max_wait):
204+
def _terminate_if_down(scheduler_module, config, asg_name, instance_id, max_wait):
205205
"""Check that node is correctly attached to scheduler otherwise terminate the instance."""
206206
asg_client = boto3.client("autoscaling", region_name=config.region, config=config.proxy_config)
207207

@@ -219,7 +219,7 @@ def _poll_wait_for_node_ready():
219219
_dump_logs(instance_id)
220220
# jobwatcher already has the logic to request a new host in case of down nodes,
221221
# which is done in order to speed up cluster recovery.
222-
_self_terminate(asg_client, instance_id, decrement_desired=True)
222+
_self_terminate(asg_client, instance_id, decrement_desired=not _maintain_size(asg_name, asg_client))
223223

224224

225225
@retry(
@@ -346,7 +346,7 @@ def _poll_instance_status(config, scheduler_module, asg_name, hostname, instance
346346
:param instance_type: current instance type
347347
"""
348348
_wait_for_stack_ready(config.stack_name, config.region, config.proxy_config)
349-
_terminate_if_down(scheduler_module, config, instance_id, INITIAL_TERMINATE_TIMEOUT)
349+
_terminate_if_down(scheduler_module, config, asg_name, instance_id, INITIAL_TERMINATE_TIMEOUT)
350350

351351
idletime = _init_idletime()
352352
instance_properties = get_instance_properties(config.region, config.proxy_config, instance_type)
@@ -358,7 +358,7 @@ def _poll_instance_status(config, scheduler_module, asg_name, hostname, instance
358358
max_cluster_size = _refresh_cluster_properties(config.region, config.proxy_config, asg_name)
359359

360360
_store_idletime(idletime)
361-
_terminate_if_down(scheduler_module, config, instance_id, TERMINATE_TIMEOUT)
361+
_terminate_if_down(scheduler_module, config, asg_name, instance_id, TERMINATE_TIMEOUT)
362362

363363
has_jobs = _has_jobs(scheduler_module, hostname)
364364
if has_jobs:

0 commit comments

Comments
 (0)