Skip to content

Commit b4acceb

Browse files
committed
[Performance] Add chef attribute cluster/cfnhup_on_fleet_enabled to disable cfn-hup on compute and login nodes.
1 parent 2eace6f commit b4acceb

File tree

11 files changed

+161
-5
lines changed

11 files changed

+161
-5
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@ aws-parallelcluster-cookbook CHANGELOG
33

44
This file is used to list changes made in each version of the AWS ParallelCluster cookbook.
55

6+
3.15.0
7+
------
8+
9+
**CHANGES**
10+
1. Add chef attribute `cluster/cfnhup_on_fleet_enabled` to disable cfn-hup on compute and login nodes.
11+
12+
613
3.14.0
714
------
815

cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
dcv_port: node['cluster']['dcv_port'],
3333
dcv_auth_certificate: node['cluster']['dcv']['authenticator']['certificate'],
3434
dcv_auth_private_key: node['cluster']['dcv']['authenticator']['private_key'],
35-
dcv_auth_user: node['cluster']['dcv']['authenticator']['user']
35+
dcv_auth_user: node['cluster']['dcv']['authenticator']['user'],
36+
cfnhup_enabled: cfnhup_enabled?
3637
)
3738
end

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,28 @@
5757
end
5858
end
5959

60+
# Head node case: the fleet-level switch is set to 'false', yet the rendered supervisord
# configuration is still expected to contain the cfn-hup program section.
context "when head node and cfn-hup disabled on fleet" do
  cached(:chef_run) do
    chef_runner = runner(platform: platform, version: version) do |node|
      node.override['cluster']['node_type'] = 'HeadNode'
      node.override['cluster']['dcv_enabled'] = 'head_node'
      node.override['cluster']['cfnhup_on_fleet_enabled'] = 'false'
      allow_any_instance_of(Object).to receive(:dcv_installed?).and_return(true)
    end
    chef_runner.converge(described_recipe)
  end
  cached(:node) { chef_run.node }

  it 'has the correct content' do
    # Every fragment below must appear in the rendered file; the matcher is built by
    # chaining one with_content call per fragment, in this order.
    expected_fragments = [
      "[program:cfn-hup]",
      "[program:clustermgtd]",
      "[program:clusterstatusmgtd]",
      "[program:pcluster_dcv_authenticator]",
      "--port 8444",
    ]
    rendered_conf = expected_fragments.reduce(
      render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
    ) { |matcher, fragment| matcher.with_content(fragment) }

    is_expected.to rendered_conf
  end
end
81+
6082
context "when compute fleet" do
6183
cached(:chef_run) do
6284
runner = runner(platform: platform, version: version) do |node|
@@ -77,6 +99,25 @@
7799
.with_content("[program:pcluster_dcv_authenticator]")
78100
end
79101
end
102+
103+
# Compute node case: with the fleet-level switch set to 'false' the rendered supervisord
# configuration must keep computemgtd and must not contain the cfn-hup program section.
context "when compute fleet with cfn-hup disabled on fleet" do
  cached(:chef_run) do
    chef_runner = runner(platform: platform, version: version) do |node|
      node.override['cluster']['node_type'] = 'ComputeFleet'
      node.override['cluster']['cfnhup_on_fleet_enabled'] = 'false'
    end
    chef_runner.converge(described_recipe)
  end
  cached(:node) { chef_run.node }

  it 'has the correct content' do
    supervisord_conf = '/etc/parallelcluster/parallelcluster_supervisord.conf'

    is_expected.to render_file(supervisord_conf).with_content("[program:computemgtd]")

    is_expected.not_to render_file(supervisord_conf).with_content("[program:cfn-hup]")
  end
end
80121
context "when login node and dcv configured" do
81122
cached(:chef_run) do
82123
runner = runner(platform: platform, version: version) do |node|
@@ -109,12 +150,32 @@
109150

110151
it 'has the correct content' do
111152
is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
153+
.with_content("[program:cfn-hup]")
112154
.with_content("[program:loginmgtd]")
113155

114156
is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf')
115157
.with_content("[program:pcluster_dcv_authenticator]")
116158
end
117159
end
160+
161+
# Login node case: with the fleet-level switch set to 'false' the rendered supervisord
# configuration must keep loginmgtd and must not contain the cfn-hup program section.
context "when login node with cfn-hup disabled on fleet" do
  cached(:chef_run) do
    chef_runner = runner(platform: platform, version: version) do |node|
      node.override['cluster']['node_type'] = 'LoginNode'
      node.override['cluster']['cfnhup_on_fleet_enabled'] = 'false'
    end
    chef_runner.converge(described_recipe)
  end
  cached(:node) { chef_run.node }

  it 'has the correct content' do
    supervisord_conf = '/etc/parallelcluster/parallelcluster_supervisord.conf'

    is_expected.to render_file(supervisord_conf).with_content("[program:loginmgtd]")

    is_expected.not_to render_file(supervisord_conf).with_content("[program:cfn-hup]")
  end
end
118179
end
119180
end
120181
end

cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
# Generated by Chef for AWS ParallelCluster <%= node['cluster']['node_type'] -%>
22
# Local modifications could be overwritten.
33
<%# HeadNode, ComputeFleet, LoginNode -%>
4-
<% case node['cluster']['node_type'] -%>
5-
<% when 'HeadNode', 'ComputeFleet', 'LoginNode' -%>
4+
<% if @cfnhup_enabled -%>
65
[program:cfn-hup]
76
command = <%= node['cluster']['scripts_dir']%>/cfn-hup-runner.sh
87
autorestart = true

cookbooks/aws-parallelcluster-shared/attributes/cluster.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,6 @@
3434

3535
# Default NFS mount options
3636
default['cluster']['nfs']['hard_mount_options'] = 'hard,_netdev,noatime'
37+
38+
# Cluster Updates
39+
default['cluster']['cfnhup_on_fleet_enabled'] = 'true'

cookbooks/aws-parallelcluster-shared/libraries/helpers.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,14 @@ def wait_sync_file(path)
106106
timeout 5
107107
end
108108
end
109+
110+
# Tells whether cfn-hup must be configured on the current node.
#
# cfn-hup is always enabled on the head node, because it is required to perform cluster updates.
# On compute and login nodes it can be switched off through the `cluster/cfnhup_on_fleet_enabled`
# attribute; when it is off, live updates of compute and login nodes are not possible.
#
# The attribute is compared after `to_s`, so the string 'true' and a boolean `true` (for example
# coming from a JSON attribute file) both enable cfn-hup. Any other value, including a missing
# attribute, disables it on fleet nodes.
#
# @return [Boolean] true when cfn-hup must run on this node, false otherwise.
def cfnhup_enabled?
  # The head node never honours the fleet switch.
  return true if node['cluster']['node_type'] == 'HeadNode'

  node['cluster']['cfnhup_on_fleet_enabled'].to_s == 'true'
end
116+
117+
# Tells whether the cluster readiness check should be performed.
#
# The result is driven solely by the `cluster/cfnhup_on_fleet_enabled` attribute: the check is
# enabled exactly when cfn-hup is enabled on compute and login nodes.
# NOTE(review): presumably the readiness check depends on fleet nodes running cfn-hup - confirm.
#
# The attribute is compared after `to_s`, so the string 'true' and a boolean `true` both enable
# the check. Any other value, including a missing attribute, disables it.
#
# @return [Boolean] true when the readiness check must run, false otherwise.
def cluster_readiness_check_enabled?
  node['cluster']['cfnhup_on_fleet_enabled'].to_s == 'true'
end
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
require 'spec_helper'
2+
3+
# Unit examples for the cfnhup_enabled? helper: the head node is always enabled, while
# compute and login nodes follow the cluster/cfnhup_on_fleet_enabled attribute.
describe 'cfnhup_enabled?' do
  let(:node) { Chef::Node.new }

  context 'when node type is HeadNode' do
    before do
      node.override['cluster']['node_type'] = 'HeadNode'
    end

    it 'returns true regardless of cfnhup_on_fleet_enabled setting' do
      node.override['cluster']['cfnhup_on_fleet_enabled'] = 'false'
      expect(cfnhup_enabled?).to be(true)
    end
  end

  %w(ComputeFleet LoginNode).each do |fleet_node_type|
    context "when node type is #{fleet_node_type}" do
      before do
        node.override['cluster']['node_type'] = fleet_node_type
      end

      # One example per attribute value; the expected result mirrors the attribute.
      %w(true false).each do |attribute_value|
        it "returns #{attribute_value} when cfnhup_on_fleet_enabled is #{attribute_value}" do
          node.override['cluster']['cfnhup_on_fleet_enabled'] = attribute_value
          expect(cfnhup_enabled?).to be(attribute_value == 'true')
        end
      end
    end
  end
end
31+
32+
# Unit examples for the cluster_readiness_check_enabled? helper: the result mirrors the
# cluster/cfnhup_on_fleet_enabled attribute.
describe 'cluster_readiness_check_enabled?' do
  let(:node) { Chef::Node.new }

  # Maps the attribute value written on the node to the boolean the helper must return.
  { 'true' => true, 'false' => false }.each do |attribute_value, expected_result|
    it "returns #{expected_result} when cfnhup_on_fleet_enabled is #{expected_result}" do
      node.override['cluster']['cfnhup_on_fleet_enabled'] = attribute_value
      expect(cluster_readiness_check_enabled?).to be(expected_result)
    end
  end
end

cookbooks/aws-parallelcluster-slurm/recipes/finalize/finalize_head_node.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,4 @@
2424

2525
wait_static_fleet_running
2626

27-
wait_cluster_ready
27+
wait_cluster_ready if cluster_readiness_check_enabled?

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def update_nodes_in_queue(strategy, queues)
272272

273273
chef_sleep '15'
274274

275-
wait_cluster_ready
275+
wait_cluster_ready if cluster_readiness_check_enabled?
276276

277277
execute 'start clustermgtd' do
278278
command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd"

cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/finalize_head_node_spec.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
cached(:chef_run) do
2626
runner = runner(platform: platform, version: version) do |node|
2727
allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path)
28+
allow_any_instance_of(Object).to receive(:cluster_readiness_check_enabled?).and_return(true)
2829
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
2930

3031
node.override['cluster']['stack_name'] = cluster_name
@@ -52,6 +53,16 @@
5253
retry_delay: 90
5354
)
5455
end
56+
57+
# With cluster_readiness_check_enabled? stubbed to false, converging the recipe must not
# run the "Check cluster readiness" execute resource.
context 'when cluster readiness check is disabled' do
  it 'does not check cluster readiness' do
    chef_runner = runner(platform: platform, version: version) do |_node|
      allow_any_instance_of(Object).to receive(:cluster_readiness_check_enabled?).and_return(false)
    end
    converged_run = chef_runner.converge(described_recipe)

    expect(converged_run).not_to run_execute("Check cluster readiness")
  end
end
5566
end
5667
end
5768
end

0 commit comments

Comments
 (0)