@@ -876,7 +876,12 @@ def _test_ec2_status_check_replacement(
876876 )
877877 scheduler_commands .cancel_job (kill_job_id )
878878 # Assert static nodes are reset
879- _wait_for_node_reset (scheduler_commands , static_nodes = static_nodes , dynamic_nodes = [])
879+ _wait_for_node_reset (
880+ scheduler_commands ,
881+ static_nodes = static_nodes ,
882+ dynamic_nodes = [],
883+ stop_max_delay_secs = 1200 ,
884+ )
880885 assert_num_instances_in_cluster (cluster_name , region , len (static_nodes ))
881886 # Reset SlurmdTimeout to 180s
882887 _set_slurmd_timeout (remote_command_executor , slurm_root_path , timeout = 180 )
@@ -950,20 +955,42 @@ def _test_clustermgtd_down_logic(
950955 )
951956
952957
def _wait_for_node_reset(
    scheduler_commands,
    static_nodes,
    dynamic_nodes,
    wait_fixed_secs=20,
    stop_max_delay_secs=300,
):
    """Block until the given static and dynamic nodes have been reset.

    Static nodes are expected to first show up as down/drained and then come
    back as idle. Dynamic nodes are expected to reach the power-saved ``idle~``
    state, after which their node address/hostname mapping is verified with
    ``_assert_node_addr_host_reset``.

    ``wait_fixed_secs`` and ``stop_max_delay_secs`` are forwarded unchanged to
    every ``wait_for_compute_nodes_states`` call (presumably the polling
    interval and the overall timeout -- confirm against that helper).
    """

    def _await_states(nodes, states):
        # Single place where the polling settings are forwarded to the waiter.
        wait_for_compute_nodes_states(
            scheduler_commands,
            nodes,
            expected_states=states,
            wait_fixed_secs=wait_fixed_secs,
            stop_max_delay_secs=stop_max_delay_secs,
        )

    if static_nodes:
        logging.info("Assert static nodes are placed in DOWN during replacement")
        # A node that is both DRAIN and DOWN is reported as "drained".
        _await_states(static_nodes, ["down", "down*", "drained", "drained*"])
        logging.info("Assert static nodes are replaced")
        _await_states(static_nodes, ["idle"])

    # Dynamic nodes are only power saved after SuspendTimeout, so the static
    # nodes have to be checked before them.
    if dynamic_nodes:
        logging.info("Assert dynamic nodes are power saved")
        _await_states(dynamic_nodes, ["idle~"])
        addr_host_mapping = scheduler_commands.get_node_addr_host()
        _assert_node_addr_host_reset(addr_host_mapping, dynamic_nodes)
969996
0 commit comments