Skip to content

Commit 31d168a

Browse files
authored
Fix test_error_handling integration test
Allow the test_error_handling integration test to wait longer for a compute node to be terminated by computemgtd.
Extend wait_for_compute_nodes_states to wait for a custom amount of time.
Signed-off-by: Jacopo De Amicis <[email protected]>
1 parent 485a2a4 commit 31d168a

File tree

2 files changed

+42
-8
lines changed

2 files changed

+42
-8
lines changed

tests/integration-tests/tests/common/hit_common.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,16 @@ def assert_compute_node_states(scheduler_commands, compute_nodes, expected_state
6767
assert_that(expected_states).contains(node_states.get(node))
6868

6969

70-
@retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))
71-
def wait_for_compute_nodes_states(scheduler_commands, compute_nodes, expected_states):
72-
assert_compute_node_states(scheduler_commands, compute_nodes, expected_states)
70+
def wait_for_compute_nodes_states(
71+
scheduler_commands,
72+
compute_nodes,
73+
expected_states,
74+
wait_fixed_secs=20,
75+
stop_max_delay_secs=300,
76+
):
77+
retry(wait_fixed=seconds(wait_fixed_secs), stop_max_delay=seconds(stop_max_delay_secs))(assert_compute_node_states)(
78+
scheduler_commands, compute_nodes, expected_states
79+
)
7380

7481

7582
def assert_compute_node_reasons(scheduler_commands, compute_nodes, expected_reason):

tests/integration-tests/tests/schedulers/test_slurm.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -876,7 +876,12 @@ def _test_ec2_status_check_replacement(
876876
)
877877
scheduler_commands.cancel_job(kill_job_id)
878878
# Assert static nodes are reset
879-
_wait_for_node_reset(scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[])
879+
_wait_for_node_reset(
880+
scheduler_commands,
881+
static_nodes=static_nodes,
882+
dynamic_nodes=[],
883+
stop_max_delay_secs=1200,
884+
)
880885
assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
881886
# Reset SlurmdTimeout to 180s
882887
_set_slurmd_timeout(remote_command_executor, slurm_root_path, timeout=180)
@@ -950,20 +955,42 @@ def _test_clustermgtd_down_logic(
950955
)
951956

952957

953-
def _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes):
958+
def _wait_for_node_reset(
959+
scheduler_commands,
960+
static_nodes,
961+
dynamic_nodes,
962+
wait_fixed_secs=20,
963+
stop_max_delay_secs=300,
964+
):
954965
"""Wait for static and dynamic nodes to be reset."""
955966
if static_nodes:
956967
logging.info("Assert static nodes are placed in DOWN during replacement")
957968
# DRAIN+DOWN = drained
958969
wait_for_compute_nodes_states(
959-
scheduler_commands, static_nodes, expected_states=["down", "down*", "drained", "drained*"]
970+
scheduler_commands,
971+
static_nodes,
972+
expected_states=["down", "down*", "drained", "drained*"],
973+
wait_fixed_secs=wait_fixed_secs,
974+
stop_max_delay_secs=stop_max_delay_secs,
960975
)
961976
logging.info("Assert static nodes are replaced")
962-
wait_for_compute_nodes_states(scheduler_commands, static_nodes, expected_states=["idle"])
977+
wait_for_compute_nodes_states(
978+
scheduler_commands,
979+
static_nodes,
980+
expected_states=["idle"],
981+
wait_fixed_secs=wait_fixed_secs,
982+
stop_max_delay_secs=stop_max_delay_secs,
983+
)
963984
# dynamic nodes are power saved after SuspendTimeout. static_nodes must be checked first
964985
if dynamic_nodes:
965986
logging.info("Assert dynamic nodes are power saved")
966-
wait_for_compute_nodes_states(scheduler_commands, dynamic_nodes, expected_states=["idle~"])
987+
wait_for_compute_nodes_states(
988+
scheduler_commands,
989+
dynamic_nodes,
990+
expected_states=["idle~"],
991+
wait_fixed_secs=wait_fixed_secs,
992+
stop_max_delay_secs=stop_max_delay_secs,
993+
)
967994
node_addr_host = scheduler_commands.get_node_addr_host()
968995
_assert_node_addr_host_reset(node_addr_host, dynamic_nodes)
969996

0 commit comments

Comments
 (0)