Skip to content

Commit eb6de2b

Browse files
Merge branch 'develop' into develop
2 parents be0d750 + be39fa3 commit eb6de2b

File tree

7 files changed

+483
-31
lines changed

7 files changed

+483
-31
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@ CHANGELOG
33

44
3.x.x
55
-----
6+
**ENHANCEMENTS**
7+
- Add support for updating `SharedStorage` configuration.
68

79
**ENHANCEMENTS**
8-
- Add new configuration parameter `DeletionPolicy` for EFs and FSx for Lustre shared storage
10+
- Add new configuration parameter `DeletionPolicy` for EFS and FSx for Lustre shared storage
911
to support storage retention on deletion.
1012
- Enable server-side encryption for the EcrImageBuilder SNS topic created when deploying ParallelCluster API and used to notify on docker image build events.
1113
- Add support for on-demand capacity reservations.

cli/src/pcluster/config/update_policy.py

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from enum import Enum
1212

1313
from pcluster.config.cluster_config import QueueUpdateStrategy
14-
from pcluster.constants import DEFAULT_MAX_COUNT
14+
from pcluster.constants import AWSBATCH, DEFAULT_MAX_COUNT, SLURM
1515

1616

1717
class UpdatePolicy:
@@ -125,6 +125,39 @@ def is_slurm_queues_change(change):
125125
return any(path.startswith("SlurmQueues[") for path in change.path)
126126

127127

128+
def is_slurm_scheduler(patch):
129+
return patch.cluster.stack.scheduler == SLURM
130+
131+
132+
def is_awsbatch_scheduler(_, patch):
133+
return patch.cluster.stack.scheduler == AWSBATCH
134+
135+
136+
def is_stop_required_for_shared_storage(change):
137+
"""
138+
Cluster stop is required for unmount operation.
139+
140+
1. Remove managed/external EBS EFS and FSx sections, which indicates unmount operation.
141+
2. Change MountDir, which indicates unmount operation.
142+
"""
143+
if change.is_list and change.key == "SharedStorage" and change.old_value is not None and change.new_value is None:
144+
return True
145+
elif not change.is_list and change.key == "MountDir":
146+
return True
147+
return False
148+
149+
150+
def fail_reason_shared_storage_update_policy(change, patch):
151+
if is_awsbatch_scheduler(change, patch):
152+
return f"Update actions are not currently supported for the '{change.key}' parameter"
153+
reason = "All compute nodes must be stopped"
154+
# QueueUpdateStrategy can override UpdatePolicy of parameters under SlurmQueues
155+
if not is_stop_required_for_shared_storage(change) and is_slurm_scheduler(patch):
156+
reason += " or QueueUpdateStrategy must be set"
157+
158+
return reason
159+
160+
128161
def fail_reason_queue_update_strategy(change, _):
129162
reason = "All compute nodes must be stopped"
130163
# QueueUpdateStrategy can override UpdatePolicy of parameters under SlurmQueues
@@ -149,11 +182,48 @@ def condition_checker_queue_update_strategy(change, patch):
149182
return result
150183

151184

185+
def actions_needed_shared_storage_update(change, patch):
186+
if is_awsbatch_scheduler(change, patch):
187+
return (
188+
f"Restore '{change.key}' value to '{change.old_value}'"
189+
if change.old_value is not None
190+
else "{0} the parameter '{1}'".format("Restore" if change.is_list else "Remove", change.key)
191+
+ ". If you need this change, please consider creating a new cluster instead of updating the existing one."
192+
)
193+
actions = "Stop the compute fleet with the pcluster update-compute-fleet command"
194+
if not is_stop_required_for_shared_storage(change) and is_slurm_scheduler(patch):
195+
actions += ", or set QueueUpdateStrategy in the configuration used for the 'update-cluster' operation"
196+
197+
return actions
198+
199+
200+
def condition_checker_shared_storage_update_policy(change, patch):
201+
"""
202+
Check different requirements for different schedulers.
203+
204+
Compute fleet stop is required for plugin scheduler.
205+
Update for awsbatch scheduler is not supported.
206+
QueueUpdateStrategy can override UpdatePolicy of parameters under SlurmQueues for slurm scheduler.
207+
"""
208+
if is_awsbatch_scheduler(change, patch):
209+
return False
210+
result = not patch.cluster.has_running_capacity()
211+
if is_slurm_scheduler(patch) and not is_stop_required_for_shared_storage(change):
212+
result = (
213+
result
214+
or patch.target_config.get("Scheduling")
215+
.get("SlurmSettings", {})
216+
.get("QueueUpdateStrategy", QueueUpdateStrategy.COMPUTE_FLEET_STOP.value)
217+
!= QueueUpdateStrategy.COMPUTE_FLEET_STOP.value
218+
)
219+
220+
return result
221+
222+
152223
# Common fail_reason messages
153224
UpdatePolicy.FAIL_REASONS = {
154225
"ebs_volume_resize": "Updating the file system after a resize operation requires commands specific to your "
155226
"operating system.",
156-
"shared_storage_change": "Shared Storage cannot be added or removed during a 'pcluster update-cluster' operation",
157227
"cookbook_update": lambda change, patch: (
158228
"Updating cookbook related parameter is not supported because it only "
159229
"applies updates to compute nodes. If you still want to proceed, first stop the compute fleet with the "
@@ -169,6 +239,7 @@ def condition_checker_queue_update_strategy(change, patch):
169239
),
170240
"pcluster_stop": lambda change, patch: "Stop the compute fleet with the pcluster update-compute-fleet command",
171241
"pcluster_stop_conditional": actions_needed_queue_update_strategy,
242+
"shared_storage_update_conditional": actions_needed_shared_storage_update,
172243
}
173244

174245
# Base policies
@@ -220,6 +291,15 @@ def condition_checker_queue_update_strategy(change, patch):
220291
condition_checker=condition_checker_queue_update_strategy,
221292
)
222293

294+
# Update policy for updating SharedStorage
295+
UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY = UpdatePolicy(
296+
name="SHARED_STORAGE_UPDATE_POLICY",
297+
level=6 if not is_awsbatch_scheduler else 1000,
298+
fail_reason=fail_reason_shared_storage_update_policy,
299+
action_needed=UpdatePolicy.ACTIONS_NEEDED["shared_storage_update_conditional"],
300+
condition_checker=condition_checker_shared_storage_update_policy,
301+
)
302+
223303
# Update supported on new addition or on removal only with all compute nodes down
224304
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE = UpdatePolicy(
225305
name="COMPUTE_FLEET_STOP_ON_REMOVE",

cli/src/pcluster/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
DELETION_POLICIES_WITH_SNAPSHOT = DELETION_POLICIES + ["Snapshot"]
2626
SUPPORTED_ARCHITECTURES = ["x86_64", "arm64"]
2727
SUPPORTED_OSES_FOR_ARCHITECTURE = {"x86_64": SUPPORTED_OSES, "arm64": SUPPORTED_OSES}
28+
SLURM = "slurm"
29+
AWSBATCH = "awsbatch"
2830

2931
OS_MAPPING = {
3032
"centos7": {"user": "centos", "root-device": "/dev/sda1"},

cli/src/pcluster/schemas/cluster_schema.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -384,11 +384,13 @@ class FsxLustreSettingsSchema(BaseSchema):
384384
)
385385
per_unit_storage_throughput = fields.Int(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
386386
backup_id = fields.Str(
387-
validate=validate.Regexp("^(backup-[0-9a-f]{8,})$"), metadata={"update_policy": UpdatePolicy.UNSUPPORTED}
387+
validate=validate.Regexp("^(backup-[0-9a-f]{8,})$"),
388+
metadata={"update_policy": UpdatePolicy.UNSUPPORTED},
388389
)
389390
kms_key_id = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
390391
file_system_id = fields.Str(
391-
validate=validate.Regexp(r"^fs-[0-9a-z]{17}$"), metadata={"update_policy": UpdatePolicy.UNSUPPORTED}
392+
validate=validate.Regexp(r"^fs-[0-9a-z]{17}$"),
393+
metadata={"update_policy": UpdatePolicy.UNSUPPORTED},
392394
)
393395
auto_import_policy = fields.Str(
394396
validate=validate.OneOf(["NEW", "NEW_CHANGED", "NEW_CHANGED_DELETED"]),
@@ -469,7 +471,9 @@ class SharedStorageSchema(BaseSchema):
469471
"""Represent the generic SharedStorage schema."""
470472

471473
mount_dir = fields.Str(
472-
required=True, validate=get_field_validator("file_path"), metadata={"update_policy": UpdatePolicy.UNSUPPORTED}
474+
required=True,
475+
validate=get_field_validator("file_path"),
476+
metadata={"update_policy": UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY},
473477
)
474478
name = fields.Str(required=True, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
475479
storage_type = fields.Str(
@@ -1935,9 +1939,7 @@ class ClusterSchema(BaseSchema):
19351939
SharedStorageSchema,
19361940
many=True,
19371941
metadata={
1938-
"update_policy": UpdatePolicy(
1939-
UpdatePolicy.UNSUPPORTED, fail_reason=UpdatePolicy.FAIL_REASONS["shared_storage_change"]
1940-
),
1942+
"update_policy": UpdatePolicy(UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY),
19411943
"update_key": "Name",
19421944
},
19431945
)

cli/src/pcluster/templates/cdk_builder_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -491,7 +491,7 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
491491
),
492492
iam.PolicyStatement(
493493
sid="Ec2TagsAndVolumes",
494-
actions=["ec2:AttachVolume", "ec2:CreateTags"],
494+
actions=["ec2:AttachVolume", "ec2:CreateTags", "ec2:DetachVolume"],
495495
effect=iam.Effect.ALLOW,
496496
resources=[
497497
self._format_arn(

cli/tests/pcluster/config/test_config_patch.py

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -423,15 +423,19 @@ def _test_less_target_sections(base_conf, target_conf):
423423
_get_storage_by_name(base_conf, "ebs1"),
424424
None,
425425
UpdatePolicy(
426-
UpdatePolicy.UNSUPPORTED,
427-
name="UNSUPPORTED",
428-
fail_reason=(
429-
"Shared Storage cannot be added or removed during a 'pcluster update-cluster' operation"
430-
),
426+
UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY,
427+
name="SHARED_STORAGE_UPDATE_POLICY",
431428
),
432429
is_list=True,
433430
),
434-
Change(["SharedStorage[ebs2]"], "MountDir", "vol2", "vol1", UpdatePolicy.UNSUPPORTED, is_list=False),
431+
Change(
432+
["SharedStorage[ebs2]"],
433+
"MountDir",
434+
"vol2",
435+
"vol1",
436+
UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY,
437+
is_list=False,
438+
),
435439
Change(["SharedStorage[ebs2]", "EbsSettings"], "Iops", None, 20, UpdatePolicy.SUPPORTED, is_list=False),
436440
Change(
437441
["SharedStorage[ebs2]", "EbsSettings"],
@@ -524,14 +528,18 @@ def _test_more_target_sections(base_conf, target_conf):
524528
None,
525529
_get_storage_by_name(target_conf, "ebs1"),
526530
UpdatePolicy(
527-
UpdatePolicy.UNSUPPORTED,
528-
fail_reason=(
529-
"Shared Storage cannot be added or removed during a 'pcluster update-cluster' operation"
530-
),
531+
UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY,
531532
),
532533
is_list=True,
533534
),
534-
Change(["SharedStorage[ebs2]"], "MountDir", "vol2", "vol1", UpdatePolicy.UNSUPPORTED, is_list=False),
535+
Change(
536+
["SharedStorage[ebs2]"],
537+
"MountDir",
538+
"vol2",
539+
"vol1",
540+
UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY,
541+
is_list=False,
542+
),
535543
Change(["SharedStorage[ebs2]", "EbsSettings"], "Iops", None, 20, UpdatePolicy.SUPPORTED, is_list=False),
536544
Change(
537545
["SharedStorage[ebs2]", "EbsSettings"],
@@ -588,10 +596,15 @@ def _test_incompatible_ebs_sections(base_conf, target_conf):
588596
target_conf,
589597
[
590598
Change(
591-
["SharedStorage[ebs1]"], "MountDir", "vol1", "new_value", UpdatePolicy(UpdatePolicy.UNSUPPORTED), False
599+
["SharedStorage[ebs1]"],
600+
"MountDir",
601+
"vol1",
602+
"new_value",
603+
UpdatePolicy(UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY),
604+
False,
592605
)
593606
],
594-
UpdatePolicy.UNSUPPORTED,
607+
UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY,
595608
)
596609

597610

@@ -615,9 +628,8 @@ def _test_different_names(base_conf, target_conf):
615628
assert_that(_get_storage_by_name(target_conf, "ebs1")).is_none()
616629
assert_that(_get_storage_by_name(target_conf, "ebs-")).is_none()
617630

618-
unsupported_update_policy = UpdatePolicy(
619-
UpdatePolicy.UNSUPPORTED,
620-
fail_reason="Shared Storage cannot be added or removed during a 'pcluster update-cluster' operation",
631+
shared_storage_update_policy = UpdatePolicy(
632+
UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY,
621633
)
622634

623635
# The patch should contain 5 differences:
@@ -627,12 +639,12 @@ def _test_different_names(base_conf, target_conf):
627639
base_conf,
628640
target_conf,
629641
[
630-
Change([], "SharedStorage", None, target_ebs_1_section, unsupported_update_policy, is_list=True),
631-
Change([], "SharedStorage", None, target_ebs_2_section, unsupported_update_policy, is_list=True),
632-
Change([], "SharedStorage", base_ebs_1_section, None, unsupported_update_policy, is_list=True),
633-
Change([], "SharedStorage", base_ebs_2_section, None, unsupported_update_policy, is_list=True),
642+
Change([], "SharedStorage", None, target_ebs_1_section, shared_storage_update_policy, is_list=True),
643+
Change([], "SharedStorage", None, target_ebs_2_section, shared_storage_update_policy, is_list=True),
644+
Change([], "SharedStorage", base_ebs_1_section, None, shared_storage_update_policy, is_list=True),
645+
Change([], "SharedStorage", base_ebs_2_section, None, shared_storage_update_policy, is_list=True),
634646
],
635-
UpdatePolicy.UNSUPPORTED,
647+
UpdatePolicy.SHARED_STORAGE_UPDATE_POLICY,
636648
)
637649

638650

0 commit comments

Comments
 (0)