
Commit f736286

hanwen-cluster authored and jdeamicis committed
[Integ-test] test slurm with custom partitions
This new test do the following checks: 1. pcluster nodes in custom partition are not brought down when the partition is set to inactive 2. cluster recovers without over-scaling after terminating EC2 instances of static nodes belonging to multiple partition 3. protected mode only manages pcluster partitions 4. pcluster stop/start only manages pcluster partitions Signed-off-by: Hanwen <[email protected]>
1 parent c1cb723 commit f736286
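The checks above are driven through the test's scheduler_commands helpers (set_partition_state, get_partition_state), which presumably wrap ordinary Slurm administration commands. A minimal sketch of the manual equivalents, assuming a running cluster with the CustomerPartition1 partition from the test configuration (standard scontrol/sinfo commands, not helpers added by this commit):

    # Disable a user-defined partition; check 1 expects the ParallelCluster nodes in it to stay up.
    scontrol update PartitionName=CustomerPartition1 State=INACTIVE
    # Report each partition's availability, i.e. the state the test reads via get_partition_state().
    sinfo --noheader -o "%P %a"
    # Re-enable the partition once the check is done.
    scontrol update PartitionName=CustomerPartition1 State=UP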

File tree: 4 files changed (+195, -16 lines)


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ CHANGELOG
 
 **CHANGES**
 - Assign Slurm dynamic nodes a priority (weight) of 1000 by default. This allows Slurm to prioritize idle static nodes over idle dynamic ones.
+- Make `aws-parallelcluster-node` daemons handle only ParallelCluster-managed Slurm partitions.
 
 **BUG FIXES**
 - Add validation to `ScaledownIdletime` value, to prevent setting a value lower than `-1`.
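Two notes on the entries above. The dynamic-node priority refers to Slurm's node Weight: among idle nodes that can satisfy a job, Slurm allocates lower-weight nodes first, so static nodes (default weight 1) are preferred over dynamic nodes weighted 1000. A quick way to inspect this on a running cluster is sketched below; the node name is only an illustration based on this test's queues. The second entry, about handling only ParallelCluster-managed partitions, is what the CustomerPartition1/CustomerPartition2 partitions defined through CustomSlurmSettings in the config template later in this commit exercise.

    # List node names with their scheduling weights; static nodes should show a lower weight than dynamic ones.
    sinfo --noheader -N -o "%N %w"
    # Or inspect a single dynamic node (placeholder name).
    scontrol show node ondemand2-dy-ondemand2-c5large-1 | grep -o "Weight=[0-9]*"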

tests/integration-tests/tests/schedulers/test_slurm.py

Lines changed: 121 additions & 16 deletions
@@ -222,6 +222,87 @@ def test_slurm_scaling(
     assert_no_errors_in_logs(remote_command_executor, scheduler)
 
 
+@pytest.mark.usefixtures("os", "instance", "scheduler")
+@pytest.mark.slurm_scaling
+def test_slurm_custom_partitions(
+    region, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir, scheduler_commands_factory
+):
+    """Test ParallelCluster node daemons manage only Slurm partitions specified in the cluster configuration file."""
+    bucket_name = s3_bucket_factory()
+    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
+    bucket.upload_file(str(test_datadir / "preinstall.sh"), "scripts/preinstall.sh")
+    custom_partitions = ["CustomerPartition1", "CustomerPartition2"]
+    cluster_config = pcluster_config_reader(bucket=bucket_name)
+    cluster = clusters_factory(cluster_config)
+    remote_command_executor = RemoteCommandExecutor(cluster)
+    scheduler_commands = scheduler_commands_factory(remote_command_executor)
+
+    logging.info("Checking number of instances...")
+    static_nodes = list(set(scheduler_commands.get_compute_nodes()))
+    assert_that(static_nodes).is_length(3)
+    assert_num_instances_in_cluster(cluster.name, region, len(static_nodes))
+    logging.info(
+        f"Setting {custom_partitions[0]} to inactive to verify pcluster nodes in the partition are not brought down..."
+    )
+    scheduler_commands.set_partition_state(custom_partitions[0], "INACTIVE")
+    logging.info("Terminating cluster EC2 instances to check the cluster can recover the nodes without over-scaling...")
+    _terminate_nodes_manually(get_compute_nodes_instance_ids(cluster.name, region), region)
+    # Assert that the cluster replaced the static nodes and reset the dynamic nodes
+    _wait_for_node_reset(scheduler_commands, static_nodes, [])
+    assert_num_instances_in_cluster(cluster.name, region, len(static_nodes))
+    logging.info(f"Setting {custom_partitions[0]} to active...")
+    scheduler_commands.set_partition_state(custom_partitions[0], "UP")
+
+    logging.info("Decreasing protected failure count to enter protected mode more quickly...")
+    clustermgtd_conf_path = _retrieve_clustermgtd_conf_path(remote_command_executor)
+    _set_protected_failure_count(remote_command_executor, 2, clustermgtd_conf_path)
+    failing_partition = "ondemand1"
+    logging.info("Testing protected mode is skipped while a job is running and activated when no jobs are in the queue...")
+    pending_job_id = _test_active_job_running(
+        scheduler_commands,
+        remote_command_executor,
+        running_partition=custom_partitions[0],
+        failing_partition=failing_partition,
+    )
+    _check_protected_mode_message_in_log(remote_command_executor)
+    check_status(cluster, compute_fleet_status="PROTECTED")
+    _wait_for_partition_state_changed(scheduler_commands, failing_partition, "INACTIVE")
+    logging.info(
+        "Checking partitions other than the failing partition are active, "
+        "i.e. custom partitions are not managed by protected mode..."
+    )
+    all_partitions = scheduler_commands.get_partitions()
+    for partition in all_partitions:
+        if partition != failing_partition:
+            assert_that(scheduler_commands.get_partition_state(partition=partition)).is_equal_to("UP")
+    scheduler_commands.cancel_job(pending_job_id)
+
+    logging.info("Checking pcluster stop...")
+    cluster.stop()
+    logging.info("Checking all cluster EC2 instances are terminated...")
+    wait_for_num_instances_in_cluster(cluster.name, region, 0)
+    logging.info("Checking pcluster stop does not manage custom partitions...")
+    for partition in all_partitions:
+        if partition in custom_partitions:
+            expected_state = "UP"
+        else:
+            expected_state = "INACTIVE"
+        assert_that(scheduler_commands.get_partition_state(partition=partition)).is_equal_to(expected_state)
+
+    logging.info("Checking pcluster start...")
+    for partition in custom_partitions:
+        scheduler_commands.set_partition_state(partition, "INACTIVE")
+    cluster.start()
+    wait_for_num_instances_in_cluster(cluster.name, region, len(static_nodes))
+    logging.info("Checking pcluster start does not manage custom partitions...")
+    for partition in all_partitions:
+        if partition in custom_partitions:
+            expected_state = "INACTIVE"
+        else:
+            expected_state = "UP"
+        assert_that(scheduler_commands.get_partition_state(partition=partition)).is_equal_to(expected_state)
+
+
 @pytest.mark.usefixtures("region", "os", "instance", "scheduler")
 @pytest.mark.slurm_error_handling
 def test_error_handling(
@@ -294,7 +375,16 @@ def test_slurm_protected_mode(
     _test_disable_protected_mode(
         remote_command_executor, cluster, bucket_name, pcluster_config_reader, clustermgtd_conf_path
     )
-    pending_job_id = _test_active_job_running(scheduler_commands, remote_command_executor, clustermgtd_conf_path)
+
+    # Re-enable protected mode
+    _enable_protected_mode(remote_command_executor, clustermgtd_conf_path)
+    # Decrease protected failure count for quicker enter protected mode.
+    _set_protected_failure_count(remote_command_executor, 2, clustermgtd_conf_path)
+
+    partition = "half-broken"
+    pending_job_id = _test_active_job_running(
+        scheduler_commands, remote_command_executor, running_partition=partition, failing_partition=partition
+    )
     _test_protected_mode(scheduler_commands, remote_command_executor, cluster)
     test_cluster_health_metric(["NoCorrespondingInstanceErrors", "OnNodeStartRunErrors"], cluster.cfn_name, region)
     _test_job_run_in_working_queue(scheduler_commands)
@@ -1724,24 +1814,35 @@ def _test_disable_protected_mode(
     )
 
 
-def _test_active_job_running(scheduler_commands, remote_command_executor, clustermgtd_conf_path):
-    """Test cluster is not placed into protected mode when there is an active job running even reach threshold."""
+def _test_active_job_running(scheduler_commands, remote_command_executor, running_partition, failing_partition):
+    """
+    Test the cluster is not placed into protected mode when there is an active job running, even after the threshold is reached.
+
+    running_partition and failing_partition should usually be the same. When Slurm partitions are customized,
+    running_partition and failing_partition can be different as long as the running job is on nodes belonging to both
+    partitions.
+    """
     # Submit a job to the queue contains broken nodes and normal node, submit the job to the normal node to test
     # the queue will not be disabled if there's active job running.
     cancel_job_id = scheduler_commands.submit_command_and_assert_job_accepted(
-        submit_command_args={"command": "sleep 3000", "nodes": 1, "partition": "half-broken", "constraint": "c5.xlarge"}
+        submit_command_args={
+            "command": "sleep 3000",
+            "nodes": 1,
+            "partition": running_partition,
+            "constraint": "c5.xlarge",
+        }
     )
     # Wait for the job to run
     scheduler_commands.wait_job_running(cancel_job_id)
 
-    # Re-enable protected mode
-    _enable_protected_mode(remote_command_executor, clustermgtd_conf_path)
-    # Decrease protected failure count for quicker enter protected mode.
-    _set_protected_failure_count(remote_command_executor, 2, clustermgtd_conf_path)
-
     # Submit a job to the problematic compute resource, so the protected_failure count will increase
     job_id_pending = scheduler_commands.submit_command_and_assert_job_accepted(
-        submit_command_args={"command": "sleep 60", "nodes": 2, "partition": "half-broken", "constraint": "c5.large"}
+        submit_command_args={
+            "command": "sleep 60",
+            "nodes": 2,
+            "partition": failing_partition,
+            "constraint": "c5.large",
+        }
     )
     # Check the threshold reach but partition will be still UP since there's active job running
     retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))(assert_lines_in_logs)(
@@ -1751,7 +1852,7 @@ def _test_active_job_running(scheduler_commands, remote_command_executor, cluste
             "currently have jobs running, not disabling them",
         ],
     )
-    assert_that(scheduler_commands.get_partition_state(partition="half-broken")).is_equal_to("UP")
+    assert_that(scheduler_commands.get_partition_state(partition=failing_partition)).is_equal_to("UP")
     # Cancel the job
     scheduler_commands.cancel_job(cancel_job_id)
     return job_id_pending
@@ -1760,6 +1861,15 @@ def _test_active_job_running(scheduler_commands, remote_command_executor, cluste
 def _test_protected_mode(scheduler_commands, remote_command_executor, cluster):
     """Test cluster will be placed into protected mode when protected count reach threshold and no job running."""
     # See if the cluster can be put into protected mode when there's no job running after reaching threshold
+    _check_protected_mode_message_in_log(remote_command_executor)
+    # Assert bootstrap failure queues are inactive and compute fleet status is PROTECTED
+    check_status(cluster, compute_fleet_status="PROTECTED")
+    assert_that(scheduler_commands.get_partition_state(partition="normal")).is_equal_to("UP")
+    _wait_for_partition_state_changed(scheduler_commands, "broken", "INACTIVE")
+    _wait_for_partition_state_changed(scheduler_commands, "half-broken", "INACTIVE")
+
+
+def _check_protected_mode_message_in_log(remote_command_executor):
     retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))(assert_lines_in_logs)(
         remote_command_executor,
         ["/var/log/parallelcluster/clustermgtd"],
@@ -1770,11 +1880,6 @@ def _test_protected_mode(scheduler_commands, remote_command_executor, cluster):
             "is in power up state without valid backing instance",
         ],
     )
-    # Assert bootstrap failure queues are inactive and compute fleet status is PROTECTED
-    check_status(cluster, compute_fleet_status="PROTECTED")
-    assert_that(scheduler_commands.get_partition_state(partition="normal")).is_equal_to("UP")
-    _wait_for_partition_state_changed(scheduler_commands, "broken", "INACTIVE")
-    _wait_for_partition_state_changed(scheduler_commands, "half-broken", "INACTIVE")
 
 
 def _test_job_run_in_working_queue(scheduler_commands):

New file (cluster configuration template used by the new test)

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+Image:
+  Os: {{ os }}
+HeadNode:
+  InstanceType: {{ instance }}
+  Networking:
+    SubnetId: {{ public_subnet_id }}
+  Ssh:
+    KeyName: {{ key_name }}
+Scheduling:
+  Scheduler: {{ scheduler }}
+  SlurmSettings:
+    CustomSlurmSettings:
+      - NodeSet: nodeset
+        Nodes: "ondemand1-st-ondemand1-i1-[1-2],ondemand2-dy-ondemand2-c5large-[1-10]"
+      - PartitionName: CustomerPartition1
+        Nodes: nodeset
+      - PartitionName: CustomerPartition2
+        Nodes: nodeset
+  SlurmQueues:
+    - Name: ondemand1
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+      ComputeResources:
+        - Name: ondemand1-c5large
+          Instances:
+            - InstanceType: c5.large
+        - Name: ondemand1-i1
+          Instances:
+            - InstanceType: {{ instance }}
+          MinCount: 2
+      Iam:
+        S3Access:
+          - BucketName: {{ bucket }}
+      CustomActions:
+        OnNodeStart:
+          # pre-install script that makes c5.large instances fail bootstrap
+          Script: s3://{{ bucket }}/scripts/preinstall.sh
+    - Name: ondemand2
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+      ComputeResources:
+        - Name: ondemand2-c5large
+          Instances:
+            - InstanceType: c5.large
+        - Name: ondemand2-i1
+          Instances:
+            - InstanceType: {{ instance }}
+          MinCount: 1
New file (preinstall.sh script uploaded by the test, which triggers a bootstrap failure on c5.large instances)

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+function get_instance_type() {
+    token=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
+    instance_type_url="http://169.254.169.254/latest/meta-data/instance-type"
+    instance_type=$(curl --retry 3 --retry-delay 0 --silent --fail -H "X-aws-ec2-metadata-token: ${token}" "${instance_type_url}")
+}
+get_instance_type
+if [ "${instance_type}" == "c5.large" ]; then
+    echo "Test Bootstrap error that causes instance to self terminate."
+    exit 1
+fi
