@@ -257,6 +257,9 @@ def test_slurm_scaling(
     cluster = clusters_factory(cluster_config)
     remote_command_executor = RemoteCommandExecutor(cluster)
     scheduler_commands = scheduler_commands_factory(remote_command_executor)
+    # TOFIX We observe in 3.13.0 an increase in the bootstrap time for Rocky and RHEL.
+    # We must address it and restore the default wait time to 300s.
+    stop_max_delay_secs = 400 if (os.startswith("rocky") or os.startswith("rhel")) else 300
 
     _assert_cluster_initial_conditions(scheduler_commands, 20, 20, 4)
     _test_online_node_configured_correctly(
@@ -284,18 +287,19 @@ def test_slurm_scaling(
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     _test_replace_down_nodes(
         remote_command_executor,
         scheduler_commands,
         test_datadir,
         cluster.cfn_name,
         region,
-        os,
         partition="ondemand1",
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     _test_keep_or_replace_suspended_nodes(
         scheduler_commands,
@@ -305,6 +309,7 @@ def test_slurm_scaling(
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     assert_no_errors_in_logs(remote_command_executor, scheduler)
 
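The `stop_max_delay_secs` computed at the top of the test is the value threaded into each helper below. As a side note, the OS check could be captured in a small helper along these lines (a hypothetical sketch, not part of this change; the name `_bootstrap_wait_secs` is made up):

    def _bootstrap_wait_secs(os_name, default_secs=300, slow_boot_secs=400):
        # Rocky and RHEL images currently bootstrap more slowly (see the TOFIX
        # note above), so give them a longer wait before timing out.
        return slow_boot_secs if os_name.startswith(("rocky", "rhel")) else default_secs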
@@ -1139,7 +1144,14 @@ def _test_partition_states(
 
 
 def _test_reset_terminated_nodes(
-    scheduler_commands, cluster_name, region, partition, num_static_nodes, num_dynamic_nodes, dynamic_instance_type
+    scheduler_commands,
+    cluster_name,
+    region,
+    partition,
+    num_static_nodes,
+    num_dynamic_nodes,
+    dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """
     Test that slurm nodes are reset if instances are terminated manually.
@@ -1162,7 +1174,7 @@ def _test_reset_terminated_nodes(
     # terminate all instances manually
     _terminate_nodes_manually(instance_ids, region)
     # Assert that cluster replaced static node and reset dynamic nodes
-    _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes)
+    _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs)
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
 
 
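Here `_terminate_nodes_manually` kills the backing EC2 instances behind Slurm's back, so the test can verify that the cluster daemons reset the affected nodes. A rough sketch of that behavior, assuming boto3 (illustrative, not the suite's exact helper):

    import boto3

    def terminate_instances_manually(instance_ids, region):
        # Terminate the backing EC2 instances directly, bypassing Slurm, so the
        # test can check that the cluster daemons reset the orphaned nodes.
        ec2 = boto3.client("ec2", region_name=region)
        ec2.terminate_instances(InstanceIds=instance_ids)
        # Block until EC2 reports every instance as terminated.
        ec2.get_waiter("instance_terminated").wait(InstanceIds=instance_ids)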
@@ -1172,11 +1184,11 @@ def _test_replace_down_nodes(
     test_datadir,
     cluster_name,
     region,
-    os,
     partition,
     num_static_nodes,
     num_dynamic_nodes,
     dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """Test that slurm nodes are replaced if nodes are marked DOWN."""
     logging.info("Testing that nodes replaced when set to down state")
@@ -1196,22 +1208,28 @@ def _test_replace_down_nodes(
         remote_command_executor.run_remote_script(str(test_datadir / "slurm_kill_slurmd_job.sh"), args=[node])
     # set dynamic to down manually
     _set_nodes_to_down_manually(scheduler_commands, dynamic_nodes)
-    # TOFIX We observe in 3.13.0 an increase in the bootstrap time for Rocky and RHEL.
-    # We must address it and restore the default wait time to 300s.
-    stop_max_delay_secs = 360 if os.startswith("rocky") else 300
     _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs)
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
 
 
 def _test_keep_or_replace_suspended_nodes(
-    scheduler_commands, cluster_name, region, partition, num_static_nodes, num_dynamic_nodes, dynamic_instance_type
+    scheduler_commands,
+    cluster_name,
+    region,
+    partition,
+    num_static_nodes,
+    num_dynamic_nodes,
+    dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """Test keep DRAIN nodes if there is job running, or terminate if no job is running."""
     logging.info(
         "Testing that nodes are NOT terminated when set to suspend state and there is job running on the nodes"
     )
     job_id = submit_initial_job(
         scheduler_commands,
+        # The job must run longer than the `_wait_for_node_reset` timeout
+        # plus the `_assert_nodes_not_terminated` waiting window
         "sleep 550",
         partition,
         dynamic_instance_type,
@@ -1224,13 +1242,17 @@ def _test_keep_or_replace_suspended_nodes(
     # Set all nodes to drain, static should be in DRAINED and dynamic in DRAINING
     _set_nodes_to_suspend_state_manually(scheduler_commands, static_nodes + dynamic_nodes)
     # Static nodes in DRAINED are immediately replaced
-    _wait_for_node_reset(scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[])
+    _wait_for_node_reset(
+        scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[], stop_max_delay_secs=stop_max_delay_secs
+    )
     # Assert dynamic nodes in DRAINING are not terminated during job run
     _assert_nodes_not_terminated(scheduler_commands, dynamic_nodes)
     # wait until the job is completed and check that the DRAINING dynamic nodes are then terminated
     scheduler_commands.wait_job_completed(job_id)
     scheduler_commands.assert_job_succeeded(job_id)
-    _wait_for_node_reset(scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes)
+    _wait_for_node_reset(
+        scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs
+    )
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
 
 
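For reference, putting nodes into the suspend/drain state as `_set_nodes_to_suspend_state_manually` does comes down to an `scontrol update` call; a minimal sketch of the equivalent operation (illustrative; the suite goes through its scheduler_commands wrapper rather than a raw subprocess):

    import subprocess

    def drain_nodes(node_names, reason="integ-test drain"):
        # Mark nodes DRAIN via scontrol: nodes with a running job show up as
        # DRAINING until the job finishes; idle nodes go straight to DRAINED.
        subprocess.run(
            ["scontrol", "update", "NodeName={}".format(",".join(node_names)),
             "State=DRAIN", "Reason={}".format(reason)],
            check=True,
        )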
@@ -1415,6 +1437,8 @@ def _wait_for_node_reset(
         wait_fixed_secs=wait_fixed_secs,
         stop_max_delay_secs=stop_max_delay_secs,
     )
+    # Add a delay to accommodate the node replacement process (~45s between node DOWN status and replacement)
+    time.sleep(45)
     logging.info("Assert static nodes are replaced")
     wait_for_compute_nodes_states(
         scheduler_commands,
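`stop_max_delay_secs` ultimately bounds a polling wait. A minimal sketch of a wait in that style, assuming the `retrying` library (which takes its limits in milliseconds); `get_nodes_status` is a hypothetical accessor, and the real `wait_for_compute_nodes_states` helper may differ:

    from retrying import retry

    def wait_for_states(scheduler_commands, nodes, expected_states, wait_fixed_secs=10, stop_max_delay_secs=300):
        @retry(
            retry_on_result=lambda done: done is False,  # keep polling until the check passes
            wait_fixed=wait_fixed_secs * 1000,           # retrying expects milliseconds
            stop_max_delay=stop_max_delay_secs * 1000,   # the knob this change threads through
        )
        def _poll():
            states = scheduler_commands.get_nodes_status(nodes)  # hypothetical accessor
            return all(state in expected_states for state in states.values())

        return _poll()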
@@ -1443,10 +1467,10 @@ def _assert_node_addr_host_reset(addr_host_list, nodes):
         assert_that(addr_host_list).contains("{0} {0} {0}".format(nodename))
 
 
-def _assert_nodes_not_terminated(scheduler_commands, nodes, timeout=5):
-    logging.info("Waiting for cluster daemon action")
+def _assert_nodes_not_terminated(scheduler_commands, nodes, waiting_time=2):
+    logging.info("Assert the job is still running on the DRAINING dynamic nodes for {} minutes.".format(waiting_time))
     start_time = time.time()
-    while time.time() < start_time + 60 * (timeout):
+    while time.time() < start_time + 60 * (waiting_time):
         assert_that(set(nodes) <= set(scheduler_commands.get_compute_nodes())).is_true()
         time.sleep(20)
 