Skip to content

Commit 3674487

Browse files
committed
fix(multinode-deployment-health): fix multinode deployment health
Signed-off-by: Anna Warno <[email protected]>
1 parent f74f6c4 commit 3674487

File tree

2 files changed

+6
-26
lines changed

2 files changed

+6
-26
lines changed

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/executors/slurm/executor.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -567,8 +567,13 @@ def _create_slurm_sbatch_script(
567567

568568
# wait for the server to initialize
569569
health_path = cfg.deployment.get("health_check_path", "/health")
570+
# Check only MASTER_IP for a single-instance deployment; with multiple instances, check every node IP
571+
if cfg.deployment.get("multiple_instances", False):
572+
ip_list = '"${NODES_IPS_ARRAY[@]}"'
573+
else:
574+
ip_list = '"$MASTER_IP"'
570575
s += _get_wait_for_server_handler(
571-
'"${NODES_IPS_ARRAY[@]}"',
576+
ip_list,
572577
cfg.deployment.port,
573578
health_path,
574579
"server",

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/executors/slurm/haproxy.cfg.j2

Lines changed: 0 additions & 25 deletions
This file was deleted.

0 commit comments

Comments
 (0)