@@ -222,6 +222,87 @@ def test_slurm_scaling(
     assert_no_errors_in_logs(remote_command_executor, scheduler)
 
 
+@pytest.mark.usefixtures("os", "instance", "scheduler")
+@pytest.mark.slurm_scaling
+def test_slurm_custom_partitions(
+    region, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir, scheduler_commands_factory
+):
230+ """Test that slurm-specific scaling logic is behaving as expected for normal actions and failures."""
+    bucket_name = s3_bucket_factory()
+    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
+    bucket.upload_file(str(test_datadir / "preinstall.sh"), "scripts/preinstall.sh")
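+    # Assumption: the custom partitions below are created outside ParallelCluster (e.g. by the preinstall script).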
+    custom_partitions = ["CustomerPartition1", "CustomerPartition2"]
+    cluster_config = pcluster_config_reader(bucket=bucket_name)
+    cluster = clusters_factory(cluster_config)
+    remote_command_executor = RemoteCommandExecutor(cluster)
+    scheduler_commands = scheduler_commands_factory(remote_command_executor)
+
+    logging.info("Checking number of instances...")
+    static_nodes = list(set(scheduler_commands.get_compute_nodes()))
+    assert_that(static_nodes).is_length(3)
+    assert_num_instances_in_cluster(cluster.name, region, len(static_nodes))
+    logging.info(
+        f"Setting {custom_partitions[0]} to inactive to verify pcluster nodes in the partition are not brought down..."
+    )
+    scheduler_commands.set_partition_state(custom_partitions[0], "INACTIVE")
+    logging.info("Terminating cluster EC2 instances to check cluster can recover the nodes without overscaling...")
+    _terminate_nodes_manually(get_compute_nodes_instance_ids(cluster.name, region), region)
+    # Assert that cluster replaced static node and reset dynamic nodes
+    _wait_for_node_reset(scheduler_commands, static_nodes, [])
+    assert_num_instances_in_cluster(cluster.name, region, len(static_nodes))
+    logging.info(f"Setting {custom_partitions[0]} to active...")
+    scheduler_commands.set_partition_state(custom_partitions[0], "UP")
+
256+ logging .info ("Decreasing protected failure count for quicker enter protected mode..." )
+    clustermgtd_conf_path = _retrieve_clustermgtd_conf_path(remote_command_executor)
+    _set_protected_failure_count(remote_command_executor, 2, clustermgtd_conf_path)
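+    # Assumption: "ondemand1" is a ParallelCluster-managed queue with a compute resource that fails to bootstrap.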
+    failing_partition = "ondemand1"
+    logging.info("Testing protected mode is skipped while a job is running and activated when no jobs are in the queue...")
+    pending_job_id = _test_active_job_running(
+        scheduler_commands,
+        remote_command_executor,
+        running_partition=custom_partitions[0],
+        failing_partition=failing_partition,
+    )
+    _check_protected_mode_message_in_log(remote_command_executor)
+    check_status(cluster, compute_fleet_status="PROTECTED")
+    _wait_for_partition_state_changed(scheduler_commands, failing_partition, "INACTIVE")
+    logging.info(
+        "Checking partitions other than the failing partition are active, "
+        "i.e. custom partitions are not managed by protected mode..."
+    )
+    all_partitions = scheduler_commands.get_partitions()
+    for partition in all_partitions:
+        if partition != failing_partition:
+            assert_that(scheduler_commands.get_partition_state(partition=partition)).is_equal_to("UP")
+    scheduler_commands.cancel_job(pending_job_id)
+
+    logging.info("Checking pcluster stop...")
+    cluster.stop()
+    logging.info("Checking all pcluster cluster EC2 instances are terminated...")
+    wait_for_num_instances_in_cluster(cluster.name, region, 0)
+    logging.info("Checking pcluster stop does not manage custom partitions...")
+    for partition in all_partitions:
+        if partition in custom_partitions:
+            expected_state = "UP"
+        else:
+            expected_state = "INACTIVE"
+        assert_that(scheduler_commands.get_partition_state(partition=partition)).is_equal_to(expected_state)
+
+    logging.info("Checking pcluster start...")
+    for partition in custom_partitions:
+        scheduler_commands.set_partition_state(partition, "INACTIVE")
+    cluster.start()
+    wait_for_num_instances_in_cluster(cluster.name, region, len(static_nodes))
+    logging.info("Checking pcluster start does not manage custom partitions...")
+    for partition in all_partitions:
+        if partition in custom_partitions:
+            expected_state = "INACTIVE"
+        else:
+            expected_state = "UP"
+        assert_that(scheduler_commands.get_partition_state(partition=partition)).is_equal_to(expected_state)
+
+
 @pytest.mark.usefixtures("region", "os", "instance", "scheduler")
 @pytest.mark.slurm_error_handling
 def test_error_handling(
@@ -294,7 +375,16 @@ def test_slurm_protected_mode(
     _test_disable_protected_mode(
         remote_command_executor, cluster, bucket_name, pcluster_config_reader, clustermgtd_conf_path
     )
-    pending_job_id = _test_active_job_running(scheduler_commands, remote_command_executor, clustermgtd_conf_path)
+
+    # Re-enable protected mode
+    _enable_protected_mode(remote_command_executor, clustermgtd_conf_path)
+    # Decrease protected failure count to enter protected mode more quickly.
+    _set_protected_failure_count(remote_command_executor, 2, clustermgtd_conf_path)
+
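+    # "half-broken" has both working and broken nodes, so it serves as both the running and failing partition.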
384+ partition = "half-broken"
385+ pending_job_id = _test_active_job_running (
386+ scheduler_commands , remote_command_executor , running_partition = partition , failing_partition = partition
387+ )
298388 _test_protected_mode (scheduler_commands , remote_command_executor , cluster )
299389 test_cluster_health_metric (["NoCorrespondingInstanceErrors" , "OnNodeStartRunErrors" ], cluster .cfn_name , region )
300390 _test_job_run_in_working_queue (scheduler_commands )
@@ -1724,24 +1814,35 @@ def _test_disable_protected_mode(
     )
 
 
-def _test_active_job_running(scheduler_commands, remote_command_executor, clustermgtd_conf_path):
-    """Test cluster is not placed into protected mode when there is an active job running even reach threshold."""
+def _test_active_job_running(scheduler_commands, remote_command_executor, running_partition, failing_partition):
+    """
+    Test the cluster stays out of protected mode while an active job is running, even after reaching the threshold.
+
+    running_partition and failing_partition should usually be the same. When slurm partitions are customized,
+    running_partition and failing_partition can be different as long as the running job is on nodes belonging to both
+    partitions.
+    """
     # Submit a job to the queue contains broken nodes and normal node, submit the job to the normal node to test
     # the queue will not be disabled if there's active job running.
     cancel_job_id = scheduler_commands.submit_command_and_assert_job_accepted(
-        submit_command_args={"command": "sleep 3000", "nodes": 1, "partition": "half-broken", "constraint": "c5.xlarge"}
+        submit_command_args={
+            "command": "sleep 3000",
+            "nodes": 1,
+            "partition": running_partition,
+            "constraint": "c5.xlarge",
+        }
     )
     # Wait for the job to run
     scheduler_commands.wait_job_running(cancel_job_id)
 
-    # Re-enable protected mode
-    _enable_protected_mode(remote_command_executor, clustermgtd_conf_path)
-    # Decrease protected failure count for quicker enter protected mode.
-    _set_protected_failure_count(remote_command_executor, 2, clustermgtd_conf_path)
-
     # Submit a job to the problematic compute resource, so the protected_failure count will increase
     job_id_pending = scheduler_commands.submit_command_and_assert_job_accepted(
-        submit_command_args={"command": "sleep 60", "nodes": 2, "partition": "half-broken", "constraint": "c5.large"}
+        submit_command_args={
+            "command": "sleep 60",
+            "nodes": 2,
+            "partition": failing_partition,
+            "constraint": "c5.large",
+        }
     )
     # Check the threshold reach but partition will be still UP since there's active job running
     retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))(assert_lines_in_logs)(
@@ -1751,7 +1852,7 @@ def _test_active_job_running(scheduler_commands, remote_command_executor, cluste
17511852 "currently have jobs running, not disabling them" ,
17521853 ],
17531854 )
1754- assert_that (scheduler_commands .get_partition_state (partition = "half-broken" )).is_equal_to ("UP" )
1855+ assert_that (scheduler_commands .get_partition_state (partition = failing_partition )).is_equal_to ("UP" )
17551856 # Cancel the job
17561857 scheduler_commands .cancel_job (cancel_job_id )
17571858 return job_id_pending
@@ -1760,6 +1861,15 @@ def _test_active_job_running(scheduler_commands, remote_command_executor, cluste
 def _test_protected_mode(scheduler_commands, remote_command_executor, cluster):
     """Test cluster will be placed into protected mode when protected count reach threshold and no job running."""
     # See if the cluster can be put into protected mode when there's no job running after reaching threshold
+    _check_protected_mode_message_in_log(remote_command_executor)
+    # Assert bootstrap failure queues are inactive and compute fleet status is PROTECTED
+    check_status(cluster, compute_fleet_status="PROTECTED")
+    assert_that(scheduler_commands.get_partition_state(partition="normal")).is_equal_to("UP")
+    _wait_for_partition_state_changed(scheduler_commands, "broken", "INACTIVE")
+    _wait_for_partition_state_changed(scheduler_commands, "half-broken", "INACTIVE")
+
+
+def _check_protected_mode_message_in_log(remote_command_executor):
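+    """Wait for clustermgtd to log the messages that indicate the cluster is entering protected mode."""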
     retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))(assert_lines_in_logs)(
         remote_command_executor,
         ["/var/log/parallelcluster/clustermgtd"],
@@ -1770,11 +1880,6 @@ def _test_protected_mode(scheduler_commands, remote_command_executor, cluster):
17701880 "is in power up state without valid backing instance" ,
17711881 ],
17721882 )
1773- # Assert bootstrap failure queues are inactive and compute fleet status is PROTECTED
1774- check_status (cluster , compute_fleet_status = "PROTECTED" )
1775- assert_that (scheduler_commands .get_partition_state (partition = "normal" )).is_equal_to ("UP" )
1776- _wait_for_partition_state_changed (scheduler_commands , "broken" , "INACTIVE" )
1777- _wait_for_partition_state_changed (scheduler_commands , "half-broken" , "INACTIVE" )
17781883
17791884
17801885def _test_job_run_in_working_queue (scheduler_commands ):