Skip to content

Commit 30ff88b

Browse files
hehe7318 authored and gmarciani committed
[Login Nodes] Add lifecycle hook for node launch in login nodes ASG
Signed-off-by: Giacomo Marciani <[email protected]>
1 parent 8ab2c74 commit 30ff88b

File tree

4 files changed, with 93 additions and 20 deletions

cli/src/pcluster/resources/login_node/user_data.sh

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -216,5 +216,9 @@ else
216216
timeout ${Timeout} /tmp/bootstrap.sh || error_exit
217217
fi
218218

219+
# Notify the AutoScalingGroup about the successful bootstrap
220+
IMDS_TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
221+
INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" -v http://169.254.169.254/latest/meta-data/instance-id)
222+
aws autoscaling complete-lifecycle-action --auto-scaling-group-name "${AutoScalingGroupName}" --lifecycle-hook-name "${LaunchingLifecycleHookName}" --instance-id "$INSTANCE_ID" --lifecycle-action-result CONTINUE --region "${AWS::Region}"
219223
# End of file
220224
--==BOUNDARY==

cli/src/pcluster/templates/cdk_builder_utils.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -848,7 +848,9 @@ def __init__(
848848
node: Union[HeadNode, BaseQueue, LoginNodesPool],
849849
shared_storage_infos: dict,
850850
name: str,
851+
auto_scaling_group_name: str,
851852
):
853+
self._auto_scaling_group_name = auto_scaling_group_name
852854
super().__init__(scope, id, config, node, shared_storage_infos, name)
853855

854856
def _build_policy(self) -> List[iam.PolicyStatement]:
@@ -872,6 +874,19 @@ def _build_policy(self) -> List[iam.PolicyStatement]:
872874
)
873875
],
874876
),
877+
iam.PolicyStatement(
878+
sid="Autoscaling",
879+
actions=[
880+
"autoscaling:CompleteLifecycleAction",
881+
],
882+
effect=iam.Effect.ALLOW,
883+
resources=[
884+
self._format_arn(
885+
service="autoscaling",
886+
resource=f"autoScalingGroupName/{self._auto_scaling_group_name}",
887+
)
888+
],
889+
),
875890
]
876891

877892

cli/src/pcluster/templates/login_nodes_stack.py

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ def _add_login_node_iam_resources(self):
8383
self._pool,
8484
self._shared_storage_infos,
8585
self._pool.name,
86+
f"{self._login_nodes_stack_id}-AutoScalingGroup",
8687
)
8788
self._instance_profile = self._iam_resource.instance_profile
8889
self._instance_role = self._iam_resource.instance_role
@@ -193,6 +194,10 @@ def _add_login_nodes_pool_launch_template(self):
193194
NODE_BOOTSTRAP_TIMEOUT,
194195
)
195196
),
197+
"AutoScalingGroupName": f"{self._login_nodes_stack_id}-AutoScalingGroup",
198+
"LaunchingLifecycleHookName": (
199+
f"{self._login_nodes_stack_id}-LoginNodesLaunchingLifecycleHook"
200+
),
196201
},
197202
**get_common_user_data_env(self._pool, self._config),
198203
},
@@ -225,27 +230,40 @@ def _add_login_nodes_pool_auto_scaling_group(self):
225230
auto_scaling_group = autoscaling.CfnAutoScalingGroup(
226231
self,
227232
f"{self._login_nodes_stack_id}-AutoScalingGroup",
233+
auto_scaling_group_name=f"{self._login_nodes_stack_id}-AutoScalingGroup",
228234
launch_template=launch_template_specification,
229235
min_size=str(self._pool.count),
230236
max_size=str(self._pool.count),
231237
desired_capacity=str(self._pool.count),
232238
target_group_arns=[self._login_nodes_pool_target_group.node.default_child.ref],
233239
vpc_zone_identifier=self._pool.networking.subnet_ids,
234240
)
235-
236-
self._add_lifecycle_hook(auto_scaling_group)
241+
self.terminating_lifecycle_hook = self._add_terminating_lifecycle_hook(auto_scaling_group)
242+
self.launching_lifecycle_hook = self._add_launching_lifecycle_hook(auto_scaling_group)
237243

238244
return auto_scaling_group
239245

240-
def _add_lifecycle_hook(self, auto_scaling_group):
246+
def _add_terminating_lifecycle_hook(self, auto_scaling_group):
241247
return autoscaling.CfnLifecycleHook(
242248
self,
243-
"LoginNodesASGLifecycleHook",
249+
"LoginNodesASGLifecycleHookTerminating",
244250
auto_scaling_group_name=auto_scaling_group.ref,
245251
lifecycle_transition="autoscaling:EC2_INSTANCE_TERMINATING",
252+
lifecycle_hook_name=f"{self._login_nodes_stack_id}-LoginNodesTerminatingLifecycleHook",
246253
heartbeat_timeout=self._pool.gracetime_period * 60,
247254
)
248255

256+
def _add_launching_lifecycle_hook(self, auto_scaling_group):
257+
return autoscaling.CfnLifecycleHook(
258+
self,
259+
"LoginNodesASGLifecycleHookLaunching",
260+
auto_scaling_group_name=auto_scaling_group.ref,
261+
lifecycle_hook_name=f"{self._login_nodes_stack_id}-LoginNodesLaunchingLifecycleHook",
262+
lifecycle_transition="autoscaling:EC2_INSTANCE_LAUNCHING",
263+
default_result="ABANDON",
264+
heartbeat_timeout=600,
265+
)
266+
249267
def _add_login_nodes_pool_target_group(self):
250268
return elbv2.NetworkTargetGroup(
251269
self,
@@ -321,7 +339,7 @@ def _add_resources(self):
321339
for pool in self._login_nodes.pools:
322340
pool_construct = Pool(
323341
self,
324-
f"Pool{pool.name}",
342+
f"{self._config.cluster_name}-{pool.name}",
325343
pool,
326344
self._config,
327345
self._log_group,

cli/tests/pcluster/templates/test_cluster_stack.py

Lines changed: 51 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -530,7 +530,8 @@ def assert_lifecycle_hook_properties(self, template, resource_name: str):
530530
assert resource["Type"] == "AWS::AutoScaling::LifecycleHook"
531531
properties = resource["Properties"]
532532
assert properties["LifecycleTransition"] == self.expected_lifecycle_transition
533-
assert properties["HeartbeatTimeout"] == self.expected_heartbeat_timeout
533+
if "HeartbeatTimeout" in properties:
534+
assert properties["HeartbeatTimeout"] == self.expected_heartbeat_timeout
534535

535536

536537
class IamRoleAssertion:
@@ -570,6 +571,10 @@ def assert_iam_policy_properties(self, template, resource_name: str):
570571
expected_lifecycle_transition="autoscaling:EC2_INSTANCE_TERMINATING",
571572
expected_heartbeat_timeout=7200,
572573
),
574+
LifecycleHookAssertion(
575+
expected_lifecycle_transition="autoscaling:EC2_INSTANCE_LAUNCHING",
576+
expected_heartbeat_timeout=600,
577+
),
573578
IamRoleAssertion(expected_managed_policy_arn="arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"),
574579
IamPolicyAssertion(
575580
expected_statements=[
@@ -596,6 +601,25 @@ def assert_iam_policy_properties(self, template, resource_name: str):
596601
},
597602
"Sid": "S3GetObj",
598603
},
604+
{
605+
"Action": "autoscaling:CompleteLifecycleAction",
606+
"Effect": "Allow",
607+
"Resource": {
608+
"Fn::Join": [
609+
"",
610+
[
611+
"arn:",
612+
{"Ref": "AWS::Partition"},
613+
":autoscaling:",
614+
{"Ref": "AWS::Region"},
615+
":",
616+
{"Ref": "AWS::AccountId"},
617+
":autoScalingGroupName/clustername-testloginnodespool1-AutoScalingGroup",
618+
],
619+
]
620+
},
621+
"Sid": "Autoscaling",
622+
},
599623
]
600624
),
601625
],
@@ -615,21 +639,25 @@ def test_login_nodes_traffic_management_resources_values_properties(
615639
)
616640

617641
asset_content_asg = get_asset_content_with_resource_name(
618-
cdk_assets, "Pooltestloginnodespool1Pooltestloginnodespool1AutoScalingGroup41053D91"
642+
cdk_assets, "clusternametestloginnodespool1clusternametestloginnodespool1AutoScalingGroup5EBA3937"
619643
)
620644
asset_content_nlb = get_asset_content_with_resource_name(
621-
cdk_assets, "Pooltestloginnodespool1testloginnodespool1LoadBalancer18C3DA82"
645+
cdk_assets, "clusternametestloginnodespool1testloginnodespool1LoadBalancerE1D4FCC7"
622646
)
623647
asset_content_target_group = get_asset_content_with_resource_name(
624-
cdk_assets, "Pooltestloginnodespool1testloginnodespool1TargetGroupD150DBF2"
648+
cdk_assets, "clusternametestloginnodespool1testloginnodespool1TargetGroup713F5EC5"
625649
)
626650
asset_content_nlb_listener = get_asset_content_with_resource_name(
627651
cdk_assets,
628-
"Pooltestloginnodespool1testloginnodespool1LoadBalancerLoginNodesListenertestloginnodespool1727E619B",
652+
"clusternametestloginnodespool1testloginnodespool1LoadBalancerLoginNodesListenertestloginnodespool165B4D3DC",
653+
)
654+
asset_content_lifecycle_hook_terminating = get_asset_content_with_resource_name(
655+
cdk_assets,
656+
"clusternametestloginnodespool1LoginNodesASGLifecycleHookTerminating51CA6203",
629657
)
630-
asset_content_lifecycle_hook = get_asset_content_with_resource_name(
658+
asset_content_lifecycle_hook_launching = get_asset_content_with_resource_name(
631659
cdk_assets,
632-
"Pooltestloginnodespool1LoginNodesASGLifecycleHookE54B2467",
660+
"clusternametestloginnodespool1LoginNodesASGLifecycleHookLaunching879DBA56",
633661
)
634662
asset_content_iam_role = get_asset_content_with_resource_name(
635663
cdk_assets,
@@ -639,29 +667,37 @@ def test_login_nodes_traffic_management_resources_values_properties(
639667
cdk_assets,
640668
"ParallelClusterPoliciesA50bdea9651dc48c",
641669
)
642-
print(cdk_assets)
643670
for lt_assertion in lt_assertions:
644671
if isinstance(lt_assertion, AutoScalingGroupAssertion):
645672
lt_assertion.assert_asg_properties(
646-
asset_content_asg, "Pooltestloginnodespool1Pooltestloginnodespool1AutoScalingGroup41053D91"
673+
asset_content_asg,
674+
"clusternametestloginnodespool1clusternametestloginnodespool1AutoScalingGroup5EBA3937",
647675
)
648676
elif isinstance(lt_assertion, NetworkLoadBalancerAssertion):
649677
lt_assertion.assert_nlb_properties(
650-
asset_content_nlb, "Pooltestloginnodespool1testloginnodespool1LoadBalancer18C3DA82"
678+
asset_content_nlb, "clusternametestloginnodespool1testloginnodespool1LoadBalancerE1D4FCC7"
651679
)
652680
elif isinstance(lt_assertion, TargetGroupAssertion):
653681
lt_assertion.assert_tg_properties(
654-
asset_content_target_group, "Pooltestloginnodespool1testloginnodespool1TargetGroupD150DBF2"
682+
asset_content_target_group, "clusternametestloginnodespool1testloginnodespool1TargetGroup713F5EC5"
655683
)
656684
elif isinstance(lt_assertion, NetworkLoadBalancerListenerAssertion):
657685
lt_assertion.assert_nlb_listener_properties(
658686
asset_content_nlb_listener,
659-
"Pooltestloginnodespool1testloginnodespool1LoadBalancerLoginNodesListenertestloginnodespool1727E619B",
687+
"clusternametestloginnodespool1testloginnodespool1"
688+
"LoadBalancerLoginNodesListenertestloginnodespool165B4D3DC",
660689
)
661690
elif isinstance(lt_assertion, LifecycleHookAssertion):
662-
lt_assertion.assert_lifecycle_hook_properties(
663-
asset_content_lifecycle_hook, "Pooltestloginnodespool1LoginNodesASGLifecycleHookE54B2467"
664-
)
691+
if lt_assertion.expected_lifecycle_transition == "autoscaling:EC2_INSTANCE_TERMINATING":
692+
lt_assertion.assert_lifecycle_hook_properties(
693+
asset_content_lifecycle_hook_terminating,
694+
"clusternametestloginnodespool1LoginNodesASGLifecycleHookTerminating51CA6203",
695+
)
696+
else:
697+
lt_assertion.assert_lifecycle_hook_properties(
698+
asset_content_lifecycle_hook_launching,
699+
"clusternametestloginnodespool1LoginNodesASGLifecycleHookLaunching879DBA56",
700+
)
665701
elif isinstance(lt_assertion, IamRoleAssertion):
666702
lt_assertion.assert_iam_role_properties(asset_content_iam_role, "RoleA50bdea9651dc48c")
667703
elif isinstance(lt_assertion, IamPolicyAssertion):

0 commit comments

Comments
 (0)