Skip to content

Commit f3f02d7

Browse files
committed
fix(missing-template): fix missing template
Signed-off-by: Anna Warno <[email protected]>
1 parent 3674487 commit f3f02d7

File tree

1 file changed

+15
-5
lines changed
  • packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/executors/slurm

1 file changed

+15
-5
lines changed

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/executors/slurm/executor.py

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -567,11 +567,11 @@ def _create_slurm_sbatch_script(
567567

568568
# wait for the server to initialize
569569
health_path = cfg.deployment.get("health_check_path", "/health")
570-
# Only check MASTER_IP if not multiinstance, otherwise check all IPs
570+
# For multi-instance check all node IPs, for single instance check localhost
571571
if cfg.deployment.get("multiple_instances", False):
572572
ip_list = '"${NODES_IPS_ARRAY[@]}"'
573573
else:
574-
ip_list = '"$MASTER_IP"'
574+
ip_list = '"127.0.0.1"'
575575
s += _get_wait_for_server_handler(
576576
ip_list,
577577
cfg.deployment.port,
@@ -1097,8 +1097,13 @@ def _generate_haproxy_config_with_placeholders(cfg):
10971097
"""Generate HAProxy configuration with placeholder IPs using Jinja template."""
10981098
# Set up Jinja environment
10991099
template_dir = Path(__file__).parent
1100+
template_path = template_dir / "haproxy.cfg.template"
1101+
1102+
if not template_path.exists():
1103+
raise FileNotFoundError(f"HAProxy template not found: {template_path}")
1104+
11001105
env = Environment(loader=FileSystemLoader(template_dir))
1101-
template = env.get_template("haproxy.cfg.j2")
1106+
template = env.get_template("haproxy.cfg.template")
11021107

11031108
# Prepare template data with placeholder IPs - use actual number of nodes
11041109
num_nodes = cfg.execution.num_nodes
@@ -1127,8 +1132,13 @@ def _generate_haproxy_config(cfg, nodes_ips):
11271132
"""Generate HAProxy configuration using Jinja template."""
11281133
# Set up Jinja environment
11291134
template_dir = Path(__file__).parent
1135+
template_path = template_dir / "haproxy.cfg.template"
1136+
1137+
if not template_path.exists():
1138+
raise FileNotFoundError(f"HAProxy template not found: {template_path}")
1139+
11301140
env = Environment(loader=FileSystemLoader(template_dir))
1131-
template = env.get_template("haproxy.cfg.j2")
1141+
template = env.get_template("haproxy.cfg.template")
11321142

11331143
# Prepare template data
11341144
nodes = []
@@ -1229,7 +1239,7 @@ def _get_wait_for_server_handler(
12291239

12301240

12311241
def _get_proxy_server_srun_command(cfg, remote_task_subdir):
1232-
"""Generate HAProxy proxy server srun command."""
1242+
"""Generate HAProxy proxy server srun command using template-based config."""
12331243
s = ""
12341244
s += "# HAProxy load balancer\n"
12351245
s += "# Copy template to config file (important for restarts)\n"

0 commit comments

Comments
 (0)