Skip to content

Commit 2d9d1dc

Browse files
committed
feat(add-missing-files): add missing files
1 parent aa87ded commit 2d9d1dc

File tree

4 files changed

+46
-5
lines changed

4 files changed

+46
-5
lines changed

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/common/helpers.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,15 @@ def apply_url_override(url: str) -> str:
115115
# Local executor - use localhost
116116
task_endpoint_type = task_definition["endpoint_type"]
117117
endpoint_uri = cfg.deployment.endpoints[task_endpoint_type]
118-
endpoint_url = f"http://127.0.0.1:{cfg.deployment.port}{endpoint_uri}"
118+
119+
# Use HAProxy port if multiple_instances is enabled
120+
if cfg.deployment.get("multiple_instances", False):
121+
proxy_config = cfg.execution.get("proxy", {}).get("config", {})
122+
port = proxy_config.get("haproxy_port", 5009)
123+
else:
124+
port = cfg.deployment.port
125+
126+
endpoint_url = f"http://127.0.0.1:{port}{endpoint_uri}"
119127
return endpoint_url
120128

121129

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/executors/slurm/executor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,11 @@ def execute_eval(cfg: DictConfig, dry_run: bool = False) -> str:
124124
# Create HAProxy config file with placeholder IPs only if multiple_instances is true
125125
if cfg.deployment.get("multiple_instances", False):
126126
haproxy_config = _generate_haproxy_config_with_placeholders(cfg)
127+
# Save both template and working config
128+
haproxy_template_path = local_task_subdir / "haproxy.cfg.template"
127129
haproxy_config_path = local_task_subdir / "haproxy.cfg"
130+
with open(haproxy_template_path, "w") as f:
131+
f.write(haproxy_config)
128132
with open(haproxy_config_path, "w") as f:
129133
f.write(haproxy_config)
130134

@@ -1118,6 +1122,8 @@ def _get_proxy_server_srun_command(cfg, remote_task_subdir):
11181122
"""Generate HAProxy proxy server srun command."""
11191123
s = ""
11201124
s += "# HAProxy load balancer\n"
1125+
s += "# Copy template to config file (important for restarts)\n"
1126+
s += f"cp {remote_task_subdir}/haproxy.cfg.template {remote_task_subdir}/haproxy.cfg\n"
11211127
s += "# Replace placeholder IPs with actual node IPs\n"
11221128
s += f"haproxy_config_file={remote_task_subdir}/haproxy.cfg\n"
11231129
s += 'for i in "${!NODES_IPS_ARRAY[@]}"; do\n'

packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/executors/slurm/haproxy.cfg.j2

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ defaults
66
log global
77
mode http
88
option httplog
9-
timeout connect 5s
10-
timeout client 30s
11-
timeout server 30s
9+
timeout connect 10s
10+
timeout client 3600s
11+
timeout server 3600s
1212

1313
frontend service_frontend
1414
bind *:{{ haproxy_port }}
@@ -18,7 +18,8 @@ backend service_backend
1818
mode http
1919
option httpchk GET {{ health_check_path }}
2020
http-check expect status {{ health_check_status }}
21-
balance roundrobin
21+
option http-server-close
22+
balance leastconn
2223
{% for node in nodes %}
2324
server node{{ loop.index }} {{ node.ip }}:{{ node.port }} check
2425
{% endfor %}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
global
2+
log stdout format raw local0
3+
maxconn 4096
4+
5+
defaults
6+
log global
7+
mode http
8+
option httplog
9+
timeout connect 10s
10+
timeout client 3600s
11+
timeout server 3600s
12+
13+
frontend service_frontend
14+
bind *:{{ haproxy_port }}
15+
default_backend service_backend
16+
17+
backend service_backend
18+
mode http
19+
option httpchk GET {{ health_check_path }}
20+
http-check expect status {{ health_check_status }}
21+
option http-server-close
22+
balance leastconn
23+
{% for node in nodes %}
24+
server node{{ loop.index }} {{ node.ip }}:{{ node.port }} check
25+
{% endfor %}
26+

0 commit comments

Comments
 (0)